Edinburgh Speech Tools 2.4-release
EST_Token.h
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : April 1996 */
35/*-----------------------------------------------------------------------*/
36/* Token/Tokenizer class */
37/* */
38/*=======================================================================*/
39
40#ifndef __EST_TOKEN_H__
41#define __EST_TOKEN_H__
42
43#include <cstdio>
44
45using namespace std;
46
47#include "EST_String.h"
48#include "EST_common.h"
49
50// I can never really remember this so we'll define it here
51/// The default whitespace characters
52extern const EST_String EST_Token_Default_WhiteSpaceChars;
53///
54extern const EST_String EST_Token_Default_SingleCharSymbols;
55///
56extern const EST_String EST_Token_Default_PunctuationSymbols;
57///
58extern const EST_String EST_Token_Default_PrePunctuationSymbols;
59
60/** This class is similar to \Ref{EST_String} but also maintains
61 the original punctuation and whitespace found around the
62 token.
63
64 \Ref{EST_Token}'s primary use is with \Ref{EST_TokenStream} class
65 which allows easy tokenizing of ascii files.
66
67 A token consists of four parts, any of which may be empty: a
68 name, the actual token, preceding whitespace, preceding
69 punctuation, the name and succeeding punctuation.
70
71 @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
72*/
73class EST_Token {
74 private:
75 EST_String space;
76 EST_String prepunc;
77 EST_String pname;
78 EST_String punc;
79 int linenum;
80 int linepos;
81 int p_filepos;
82 int p_quoted;
83
84 public:
85 ///
86 EST_Token() {init();}
87 ///
88 EST_Token(const EST_String p) {init(); pname = p; }
89 ///
90 void init() {p_quoted=linenum=linepos=p_filepos=0;}
91
92 /**@name Basic access to fields */
93 //@{
94 /// set token from a string
95 void set_token(const EST_String &p) { pname = p; }
96 ///
97 void set_token(const char *p) { pname = p; }
98 /// set whitespace of token.
99 void set_whitespace(const EST_String &p) { space = p; }
100 ///
101 void set_whitespace(const char *p) { space = p; }
102 /// set (post) punctuation of token.
103 void set_punctuation(const EST_String &p) { punc = p; }
104 ///
105 void set_punctuation(const char *p) { punc = p; }
106 /// set prepunction
107 void set_prepunctuation(const EST_String &p) { prepunc = p; }
108 ///
109 void set_prepunctuation(const char *p) { prepunc = p; }
110 ///
111 const EST_String &whitespace() { return space; }
112 ///
113 const EST_String &punctuation() { return punc; }
114 ///
115 const EST_String &prepunctuation() { return prepunc; }
116
117 /**@name Access token as a string */
118 //@{
119 const EST_String &string() const { return String(); }
120 /// Access token as a string
121 const EST_String &S() const { return String(); }
122 /// Access token as a string
123 const EST_String &String() const { return pname; }
124 /// For automatic coercion to \Ref{EST_String}
125 operator EST_String() const { return String(); }
126 //@}
127
128 /**@name Access token as a int */
129 //@{
130 int Int(bool &valid) const { return String().Int(valid); }
131 int Int() const { return String().Int(); }
132 int I(bool &valid) const { return Int(valid); }
133 int I() const { return Int(); }
134 operator int() const { return Int(); }
135 //@}
136
137 /**@name Access token as a long */
138 //@{
139 long Long(bool &valid) const { return String().Long(valid); }
140 long Long() const { return String().Long(); }
141 long L(bool &valid) const { return Long(valid); }
142 long L() const { return Long(); }
143 operator long() const { return Long(); }
144 //@}
145
146 /**@name Access token as a float */
147 //@{
148 float Float(bool &valid) const { return String().Float(valid); }
149 float Float() const { return String().Float(); }
150 float F(bool &valid) const { return Float(valid); }
151 float F() const { return Float(); }
152 operator float() const { return Float(); }
153 //@}
154
155 /**@name Access token as a double */
156 //@{
157 double Double(bool &valid) const { return String().Double(valid); }
158 double Double() const { return String().Double(); }
159 double D(bool &valid) const { return Double(valid); }
160 double D() const { return Double(); }
161 operator double() const { return Double(); }
162 //@}
163
164 //@}
165 //@{
166 /// Note that this token was quoted (or not)
167 void set_quoted(int q) { p_quoted = q; }
168 /// TRUE is token was quoted
169 int quoted() const { return p_quoted; }
170 //@}
171 ///
172 void set_row(int r) { linenum = r; }
173 ///
174 void set_col(int c) { linepos = c; }
175 /// Set file position in original \Ref{EST_TokenStream}
176 void set_filepos(int c) { p_filepos = c; }
177 /// Return lower case version of token name
178 EST_String lstring() { return downcase(pname); }
179 /// Return upper case version of token name
180 EST_String ustring() { return upcase(pname); }
181 /// Line number in original \Ref{EST_TokenStream}.
182 int row(void) const { return linenum; }
183 /// Line position in original \Ref{EST_TokenStream}.
184 int col(void) const { return linepos; }
185 /// file position in original \Ref{EST_TokenStream}.
186 int filepos(void) const { return p_filepos; }
187
188 /// A string describing current position, suitable for error messages
189 const EST_String pos_description() const;
190
191 ///
192 friend ostream& operator << (ostream& s, const EST_Token &p);
193
194 ///
195 EST_Token & operator = (const EST_Token &a);
196 ///
197 EST_Token & operator = (const EST_String &a);
198 ///
199 int operator == (const EST_String &a) { return (pname == a); }
200 ///
201 int operator != (const EST_String &a) { return (pname != a); }
202 ///
203 int operator == (const char *a) { return (strcmp(pname,a)==0); }
204 ///
205 int operator != (const char *a) { return (strcmp(pname,a)!=0); }
206};
207
/// Kind of source an \Ref{EST_TokenStream} is attached to.
/// NOTE(review): going by the names, these are: no source, a FILE,
/// a pipe, an in-memory string and a C++ istream -- confirm against
/// EST_Token.cc.  The order is kept so the numeric values are unchanged.
enum EST_tokenstream_type {
    tst_none,
    tst_file,
    tst_pipe,
    tst_string,
    tst_istream
};
209
210/** A class that allows the reading of \Ref{EST_Token}s from a file
211 stream, pipe or string. It automatically tokenizes a file based on
212 user definable whitespace and punctuation.
213
214 The definitions of whitespace and punctuation are user definable.
215 Also support for single character symbols is included. Single
216 character symbols {\em always} are treated as individual tokens
217 irrespective of their white space context. Also a quote
218 mode can be used to read quoted tokens.
219
220 The setting of whitespace, pre and post punctuation, single character
221 symbols and quote mode must be done (immediately) after opening
222 the stream.
223
224 There is no unget but peek provides look ahead of one token.
225
226 Note there is an interesting issue about what to do about
227 the last whitespace in the file. Should it be ignored or should
228 it be attached to a token with a name string of length zero.
229 In unquoted mode the eof() will return TRUE if the next token name
230 is empty (the mythical last token). In quoted mode the last must
231 be returned so eof will not be raised.
232
233 @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
234*/
236 private:
237 EST_tokenstream_type type;
238 EST_String WhiteSpaceChars;
239 EST_String SingleCharSymbols;
240 EST_String PunctuationSymbols;
241 EST_String PrePunctuationSymbols;
242 EST_String Origin;
243 FILE *fp;
244 istream *is;
245 int fd;
246 char *buffer;
247 int buffer_length;
248 int pos;
249 int linepos;
250 int p_filepos;
251 int getch(void);
252 EST_TokenStream &getch(char &C);
253 int peeked_charp;
254 int peeked_char; // ungot character
255 int peekch(void);
256 int peeked_tokp;
257 int eof_flag;
258 int quotes;
259 char quote;
260 char escape;
261 EST_Token current_tok;
262 void default_values(void);
263 /* local buffers to save reallocating */
264 int tok_wspacelen;
265 char *tok_wspace;
266 int tok_stufflen;
267 char *tok_stuff;
268 int tok_prepuncslen;
269 char *tok_prepuncs;
270 int close_at_end;
271
272 /* character class map */
273 char p_table[256];
274 bool p_table_wrong;
275
276 /** This function is deliberately private so that you'll get a compilation
277 error if you assign a token stream or pass it as an (non-reference)
278 argument. The problem with copying is that you need to copy the
279 file descriptors too (which can't be done for pipes). You probably
280 don't really want a copy anyway and meant to pass it as a reference.
281 If you really need this (some sort of clever look ahead) I am not
282 sure what the consequences really are (or how portable they are).
283 Pass the \Ref{EST_TokenStream} by reference instead.
284 */
286
287 void build_table();
288
289 inline int getch_internal();
290 inline int peekch_internal();
291 inline int getpeeked_internal();
292 public:
293 ///
295 /// will close file if appropriate for type
297 //@{
298 /// open a \Ref{EST_TokenStream} for a file.
299 int open(const EST_String &filename);
300 /// open a \Ref{EST_TokenStream} for an already opened file
301 int open(FILE *ofp, int close_when_finished);
302 /// open a \Ref{EST_TokenStream} for an already open istream
303 int open(istream &newis);
304 /// open a \Ref{EST_TokenStream} for string rather than a file
305 int open_string(const EST_String &newbuffer);
306 /// Close stream.
307 void close(void);
308 //@}
309 /**@name stream access functions */
310 //@{
311 /// get next token in stream
313 /// get next token in stream
314 EST_Token &get();
315 /**@name get the next token which must be the argument. */
316 //@{
317 EST_Token &must_get(EST_String expected, bool *ok);
318 EST_Token &must_get(EST_String expected, bool &ok)
319 { return must_get(expected, &ok); }
320 EST_Token &must_get(EST_String expected)
321 { return must_get(expected, (bool *)NULL); }
322 //@}
323 /// get up to {\tt s} in stream as a single token.
324 EST_Token get_upto(const EST_String &s);
325 /// get up to the end of line as a single token.
327 /// peek at next token
328 EST_Token &peek(void);
329 /// Reading binary data, (don't use peek() immediately beforehand)
330 int fread(void *buff,int size,int nitems) EST_WARN_UNUSED_RESULT;
331 //@}
332 /**@name stream initialization functions */
333 //@{
334 /// set which characters are to be treated as whitespace
336 { WhiteSpaceChars = ws; p_table_wrong=1;}
337 /// set which characters are to be treated as single character symbols
339 { SingleCharSymbols = sc; p_table_wrong=1;}
340 /// set which characters are to be treated as (post) punctuation
342 { PunctuationSymbols = ps; p_table_wrong=1;}
343 /// set which characters are to be treated as pre-punctuation
345 { PrePunctuationSymbols = ps; p_table_wrong=1;}
346 /// set characters to be used as quotes and escape, and set quote mode
347 void set_quotes(char q, char e) { quotes = TRUE; quote = q; escape = e; p_table_wrong=1;}
348 /// query quote mode
349 int quoted_mode(void) { return quotes; }
350 //@}
351 /**@name miscellaneous */
352 //@{
353 /// returns line number of \Ref{EST_TokenStream}
354 int linenum(void) const {return linepos;}
355 /// end of file
356 int eof()
357 { return (eof_flag || ((!quotes) && (peek() == ""))); }
358 /// end of line
359 int eoln();
360 /// current file position in \Ref{EST_TokenStream}
361 int filepos(void) const { return (type == tst_string) ? pos : p_filepos; }
362 /// tell, synonym for filepos
363 int tell(void) const { return filepos(); }
364 /// seek, reposition file pointer
365 int seek(int position);
366 int seek_end();
367 /// Reset to start of file/string
368 int restart(void);
369 /// A string describing current position, suitable for error messages
371 /// The originating filename (if there is one)
372 const EST_String filename() const { return Origin; }
373 /// For the people who *need* the actual description (if possible)
374 FILE *filedescriptor() { return (type == tst_file) ? fp : 0; }
375 ///
376 EST_TokenStream & operator >>(EST_Token &p);
377 ///
378 EST_TokenStream & operator >>(EST_String &p);
379 ///
380 friend ostream& operator <<(ostream& s, EST_TokenStream &p);
381 //@}
382};
383
384/** Quote a string with given quotes and escape character
385*/
386EST_String quote_string(const EST_String &s,
387 const EST_String &quote = "\"",
388 const EST_String &escape = "\\",
389 int force=0);
390
391#endif // __EST_TOKEN_H__
int restart(void)
Reset to start of file/string.
Definition: EST_Token.cc:437
int filepos(void) const
current file position in \Ref{EST_TokenStream}
Definition: EST_Token.h:361
int eof()
end of file
Definition: EST_Token.h:356
EST_Token & get()
get next token in stream
Definition: EST_Token.cc:710
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:338
EST_Token get_upto(const EST_String &s)
get up to {\tt s} in stream as a single token.
Definition: EST_Token.cc:492
int fread(void *buff, int size, int nitems) EST_WARN_UNUSED_RESULT
Reading binary data, (don't use peek() immediately beforehand)
Definition: EST_Token.cc:355
const EST_String filename() const
The originating filename (if there is one)
Definition: EST_Token.h:372
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as pre-punctuation
Definition: EST_Token.h:344
~EST_TokenStream()
will close file if appropriate for type
Definition: EST_Token.cc:167
const EST_String pos_description()
A string describing current position, suitable for error messages.
Definition: EST_Token.cc:875
int open_string(const EST_String &newbuffer)
open a \Ref{EST_TokenStream} for string rather than a file
Definition: EST_Token.cc:251
int linenum(void) const
returns line number of \Ref{EST_TokenStream}
Definition: EST_Token.h:354
void set_quotes(char q, char e)
set characters to be used as quotes and escape, and set quote mode
Definition: EST_Token.h:347
EST_Token get_upto_eoln(void)
get up to the end of line as a single token.
Definition: EST_Token.cc:516
int eoln()
end of line
Definition: EST_Token.cc:818
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:341
EST_Token & peek(void)
peek at next token
Definition: EST_Token.cc:830
void close(void)
Close stream.
Definition: EST_Token.cc:406
int tell(void) const
tell, synonym for filepos
Definition: EST_Token.h:363
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
Definition: EST_Token.cc:200
FILE * filedescriptor()
For the people who need the actual description (if possible)
Definition: EST_Token.h:374
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:335
int seek(int position)
seek, reposition file pointer
Definition: EST_Token.cc:305
int quoted_mode(void)
query quote mode
Definition: EST_Token.h:349
int quoted() const
TRUE is token was quoted.
Definition: EST_Token.h:169
EST_String ustring()
Return upper case version of token name.
Definition: EST_Token.h:180
int col(void) const
Line position in original \Ref{EST_TokenStream}.
Definition: EST_Token.h:184
const EST_String & String() const
Access token as a string.
Definition: EST_Token.h:123
void set_prepunctuation(const EST_String &p)
set prepunction
Definition: EST_Token.h:107
void set_whitespace(const EST_String &p)
set whitespace of token.
Definition: EST_Token.h:99
void set_punctuation(const EST_String &p)
set (post) punctuation of token.
Definition: EST_Token.h:103
const EST_String pos_description() const
A string describing current position, suitable for error messages.
Definition: EST_Token.cc:107
void set_filepos(int c)
Set file position in original \Ref{EST_TokenStream}.
Definition: EST_Token.h:176
EST_String lstring()
Return lower case version of token name.
Definition: EST_Token.h:178
void set_quoted(int q)
Note that this token was quoted (or not)
Definition: EST_Token.h:167
int row(void) const
Line number in original \Ref{EST_TokenStream}.
Definition: EST_Token.h:182
const EST_String & S() const
Access token as a string.
Definition: EST_Token.h:121
void set_token(const EST_String &p)
set token from a string
Definition: EST_Token.h:95
int filepos(void) const
file position in original \Ref{EST_TokenStream}.
Definition: EST_Token.h:186