Edinburgh Speech Tools 2.4-release
EST_SCFG.h
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : October 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* Stochastic context free grammars */
38/* */
39/*=======================================================================*/
40#ifndef __EST_SCFG_H__
41#define __EST_SCFG_H__
42
43#include "EST_simplestats.h"
44#include "EST_rw_status.h"
45#include "EST_TList.h"
46#include "siod.h"
47
48/** This class represents a bracketed string used in training of SCFGs.
49
50 An object in this class builds an index of valid bracketing of
51 the string, thus offering both a tree like access and direct
52 access to the leafs of the tree. The definition of ``valid
53 bracketing'' is any substring \[ W_{i,j} \] that doesn't cross any
54 brackets.
55*/
57 private:
58 int p_length;
59 LISP *symbols;
60 LISP bs;
61 int **valid_spans; // triangular matrix
62 int find_num_nodes(LISP string);
63 int set_leaf_indices(LISP string,int i,LISP *symbols);
64 int num_leafs(LISP l) const;
65 void find_valid(int i,LISP t) const;
66 void init();
67 public:
68 ///
70 ///
71 EST_bracketed_string(LISP string);
72 ///
74
75 ///
76 void set_bracketed_string(LISP string);
77 ///
78 int length() const {return p_length;}
79 ///
80 LISP string() const { return bs; }
81 /// The nth symbol in the string.
82 const EST_String symbol_at(int i) const
83 { return EST_String(get_c_string(car(symbols[i]))); }
84 /// If a bracketing from i to k is valid in string
85 int valid(int i,int k) const { return valid_spans[i][k]; }
86
87 ///
88 int operator !=(const EST_bracketed_string &a) const
89 { return (!(this == &a)); }
90 int operator ==(const EST_bracketed_string &a) const
91 { return ((this == &a)); }
92 ///
93 friend ostream& operator << (ostream &s, const EST_bracketed_string &a)
94 { (void)a; s << "[a bracketed string]" << endl; return s; }
95
96};
97
99
100// Only support Chomsky Normal Form at present
101enum est_scfg_rtype {est_scfg_unset, est_scfg_binary_rule,
102 est_scfg_unary_rule};
103
104/** A stochastic context free grammar rule.
105
106 At present only two types of rule are supported:
107 {\tt est\_scfg\_binary\_rule} and {\tt est\_scfg\_unary\_rule}.
108 This is sufficient for the representation of grammars in
109 Chomsky Normal Form. Each rule also has a probability associated
110 with it. Terminals and noterminals are represented as ints using
111 the \Ref{EST_Discrete}s in \Ref{EST_SCFG} to reference the actual
112 alphabets.
113
114 Although this class includes a ``probability'' nothing in the rule
115 itself enforces it to be a true probability. It is responsibility
116 of the classes that use this rule to enforce that condition if
117 desired.
118
119 @author Alan W Black (awb@cstr.ed.ac.uk): October 1997
120*/
122 private:
123 int p_mother;
124 int p_daughter1;
125 int p_daughter2;
126 est_scfg_rtype p_type;
127 double p_prob;
128 public:
129 ///
130 EST_SCFG_Rule() {p_type=est_scfg_unset; p_prob=0;}
131 ///
133 {p_mother = r.p_mother; p_daughter1 = r.p_daughter1;
134 p_daughter2 = r.p_daughter2; p_type=r.p_type; p_prob = r.p_prob;}
135 /// Create a unary rule.
136 EST_SCFG_Rule(double prob,int p,int m);
137 /// Create a binary rule.
138 EST_SCFG_Rule(double prob,int p, int q, int r);
139 /// The rule's probability
140 double prob() const {return p_prob;}
141 /// set the probability
142 void set_prob(double p) { p_prob=p;}
143 /// rule type
144 est_scfg_rtype type() const { return p_type; }
145 ///
146 int mother() const {return p_mother;}
147 /** In a unary rule this is a terminal, in a binary rule it
148 is a nonterminal
149 */
150 int daughter1() const {return p_daughter1;}
151 ///
152 int daughter2() const {return p_daughter2;}
153 ///
154 void set_rule(double prob,int p, int m);
155 ///
156 void set_rule(double prob,int p, int q, int r);
157};
158
160
161/** A class representing a stochastic context free grammar (SCFG).
162
163 This class includes the representation of the grammar itself and
164 methods for training and testing it against some corpus.
165
166 At presnet of grammars in Chomsky Normal Form are supported. That
167 is rules may be binary or unary. If binary the mother an two
168 daughters are nonterminals, if unary the mother must be nonterminal
169 and daughter a terminal symbol.
170
171 The terminals and nonterminals symbol sets are derived automatically
172 from the LISP representation of the rules at initialization time
173 and are represented as \Ref{EST_Discrete}s. The distinguished
174 symbol is assumed to be the first mother of the first rule in
175 the given grammar.
176
177*/
178class EST_SCFG {
179 private:
180 EST_Discrete nonterminals;
181 EST_Discrete terminals;
182 int p_distinguished_symbol;
183 // Index of probabilities for binary rules in grammar
184 double ***p_prob_B;
185 // Index of probabilities for unary rules in grammar
186 double **p_prob_U;
187 // Build rule probability caches
188 void rule_prob_cache();
189 // Delete rule probability caches
190 void delete_rule_prob_cache();
191 public:
192 /**@name Constructor and initialisation functions */
193 //@{
194 EST_SCFG();
195 /// Initialize from a set of rules
196 EST_SCFG(LISP rules);
197 ~EST_SCFG();
198 //@}
199
200 /**@name utility functions */
201 //@{
202 /// Set (or reset) rules from external source after construction
203 void set_rules(LISP rules);
204 /// Return rules as LISP list.
205 LISP get_rules();
206 /// The rules themselves
208 int distinguished_symbol() const { return p_distinguished_symbol; }
209 /** Find the terminals and nonterminals in the given grammar, adding
210 them to the appropriate given string lists.
211 */
213 /// Convert nonterminal index to string form
214 EST_String nonterminal(int p) const { return nonterminals.name(p); }
215 /// Convert terminal index to string form
216 EST_String terminal(int m) const { return terminals.name(m); }
217 /// Convert nonterminal string to index
218 int nonterminal(const EST_String &p) const { return nonterminals.name(p); }
219 /// Convert terminal string to index
220 int terminal(const EST_String &m) const { return terminals.name(m); }
221 /// Number of nonterminals
222 int num_nonterminals() const { return nonterminals.length(); }
223 /// Number of terminals
224 int num_terminals() const { return terminals.length(); }
225 /// The rule probability of given binary rule
226 double prob_B(int p, int q, int r) const { return p_prob_B[p][q][r]; }
227 /// The rule probability of given unary rule
228 double prob_U(int p, int m) const { return p_prob_U[p][m]; }
229 /// (re-)set rule probability caches
230 void set_rule_prob_cache();
231 //@}
232
233 /**@name file i/o functions */
234 //@{
235 /// Load grammar from named file
236 EST_read_status load(const EST_String &filename);
237 /// Save current grammar to named file
238 EST_write_status save(const EST_String &filename);
239 //@}
240};
241
242/** A class used to train (and test) SCFGs is an extension of
243 \Ref{EST_SCFG}.
244
245 This offers an implementation of Pereira and Schabes ``Inside-Outside
246 reestimation from partially bracket corpora.'' ACL 1992.
247
248 A SCFG maybe trained from a corpus (optionally) containing brackets
249 over a series of passes reestimating the grammar probabilities
250 after each pass. This basically extends the \Ref{EST_SCFG} class
251 adding support for a bracket corpus and various indexes for efficient
252 use of the grammar.
253*/
255 private:
256 /// Index for inside probabilities
257 double ***inside;
258 /// Index for outside probabilities
259 double ***outside;
260 EST_Bcorpus corpus;
261 /// Partial (numerator) for reestimation
262 EST_DVector n;
263 /// Partial (denominator) for reestimation
264 EST_DVector d;
265
266 /// Calculate inside probability.
267 double f_I_cal(int c, int p, int i, int k);
268 /// Lookup or calculate inside probability.
269 double f_I(int c, int p, int i, int k)
270 { double r;
271 if ((r=inside[p][i][k]) != -1) return r;
272 else return f_I_cal(c,p,i,k); }
273 /// Calculate outside probability.
274 double f_O_cal(int c, int p, int i, int k);
275 /// Lookup or calculate outside probability.
276 double f_O(int c, int p, int i, int k)
277 { double r;
278 if ((r=outside[p][i][k]) != -1) return r;
279 else return f_O_cal(c,p,i,k); }
280 /// Find probability of parse of corpus sentence {\tt c}
281 double f_P(int c);
282 /** Find probability of parse of corpus sentence {\tt c} for
283 nonterminal {\tt p}
284 */
285 double f_P(int c,int p);
286 /// Re-estimate probability of binary rule using inside-outside algorithm
287 void reestimate_rule_prob_B(int c, int ri, int p, int q, int r);
288 /// Re-estimate probability of unary rule using inside-outside algorithm
289 void reestimate_rule_prob_U(int c, int ri, int p, int m);
290 /// Do grammar re-estimation
291 void reestimate_grammar_probs(int passes,
292 int startpass,
293 int checkpoint,
294 int spread,
295 const EST_String &outfile);
296 ///
297 double cross_entropy();
298 /// Initialize the cache for inside/outside values for sentence {\tt c}
299 void init_io_cache(int c,int nt);
300 /// Clear the cache for inside/outside values for sentence {\tt c}
301 void clear_io_cache(int c);
302 public:
305
306 /** Test the current grammar against the current corpus print summary.
307
308 Cross entropy measure only is given.
309 */
310 void test_corpus();
311 /** Test the current grammar against the current corpus.
312
313 Summary includes percentage of cross bracketing accuracy
314 and percentage of fully correct parses.
315 */
316 void test_crossbrackets();
317
318 /** Load a corpus from the given file.
319
320 Each sentence in the corpus should be contained in parentheses.
321 Additional parenthesis may be used to denote phrasing within
322 a sentence. The corpus is read using the LISP reader so LISP
323 conventions shold apply, notable single quotes should appear
324 within double quotes.
325 */
326 void load_corpus(const EST_String &filename);
327
328 /** Train a grammar using the loaded corpus.
329
330 @param passes the number of training passes desired.
331 @param startpass from which pass to start from
332 @param checkpoint save the grammar every n passes
333 @param spread Percentage of corpus to use on each pass,
334 this cycles through the corpus on each pass.
335 */
336 void train_inout(int passes,
337 int startpass,
338 int checkpoint,
339 int spread,
340 const EST_String &outfile);
341};
342
343/** From a full parse, extract the string with bracketing only.
344*/
345LISP scfg_bracketing_only(LISP parse);
346/** Cummulate cross bracketing information between ref and test.
347 */
348void count_bracket_crossing(const EST_bracketed_string &ref,
349 const EST_bracketed_string &test,
350 EST_SuffStats &vs);
351
352#endif
const EST_String & name(const int n) const
The name given the index.
const int length(void) const
The number of members in the discrete.
int daughter1() const
Definition: EST_SCFG.h:150
double prob() const
The rule's probability.
Definition: EST_SCFG.h:140
void set_prob(double p)
set the probability
Definition: EST_SCFG.h:142
est_scfg_rtype type() const
rule type
Definition: EST_SCFG.h:144
void train_inout(int passes, int startpass, int checkpoint, int spread, const EST_String &outfile)
void load_corpus(const EST_String &filename)
int terminal(const EST_String &m) const
Convert terminal string to index.
Definition: EST_SCFG.h:220
EST_String nonterminal(int p) const
Convert nonterminal index to string form.
Definition: EST_SCFG.h:214
int num_nonterminals() const
Number of nonterminals.
Definition: EST_SCFG.h:222
int num_terminals() const
Number of terminals.
Definition: EST_SCFG.h:224
double prob_B(int p, int q, int r) const
The rule probability of given binary rule.
Definition: EST_SCFG.h:226
void find_terms_nonterms(EST_StrList &nt, EST_StrList &t, LISP rules)
Definition: EST_SCFG.cc:91
EST_read_status load(const EST_String &filename)
Load grammar from named file.
Definition: EST_SCFG.cc:193
LISP get_rules()
Return rules as LISP list.
Definition: EST_SCFG.cc:169
void set_rule_prob_cache()
(re-)set rule probability caches
Definition: EST_SCFG.cc:256
SCFGRuleList rules
The rules themselves.
Definition: EST_SCFG.h:207
EST_write_status save(const EST_String &filename)
Save current grammar to named file.
Definition: EST_SCFG.cc:204
EST_String terminal(int m) const
Convert terminal index to string form.
Definition: EST_SCFG.h:216
void set_rules(LISP rules)
Set (or reset) rules from external source after construction.
Definition: EST_SCFG.cc:120
double prob_U(int p, int m) const
The rule probability of given unary rule.
Definition: EST_SCFG.h:228
int nonterminal(const EST_String &p) const
Convert nonterminal string to index.
Definition: EST_SCFG.h:218
const EST_String symbol_at(int i) const
The nth symbol in the string.
Definition: EST_SCFG.h:82
int valid(int i, int k) const
If a bracketing from i to k is valid in string.
Definition: EST_SCFG.h:85