libdap Updated for version 3.20.11
libdap4 is an implementation of OPeNDAP's DAP protocol.
escaping.cc
1
2// -*- mode: c++; c-basic-offset:4 -*-
3
4// This file is part of libdap, A C++ implementation of the OPeNDAP Data
5// Access Protocol.
6
7// Copyright (c) 2002,2003 OPeNDAP, Inc.
8// Author: James Gallagher <jgallagher@opendap.org>
9//
10// This library is free software; you can redistribute it and/or
11// modify it under the terms of the GNU Lesser General Public
12// License as published by the Free Software Foundation; either
13// version 2.1 of the License, or (at your option) any later version.
14//
15// This library is distributed in the hope that it will be useful,
16// but WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18// Lesser General Public License for more details.
19//
20// You should have received a copy of the GNU Lesser General Public
21// License along with this library; if not, write to the Free Software
22// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23//
24// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
25
26// Copyright (c) 1996, California Institute of Technology.
27// ALL RIGHTS RESERVED. U.S. Government Sponsorship acknowledged.
28//
29// Please read the full copyright notice in the file COPYRIGHT_URI
30// in this directory.
31//
32// Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
33// Todd.K.Karakashian@jpl.nasa.gov
34//
35// $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
36//
37// These two routines are for escaping/unescaping strings that are identifiers
38// in DAP2
39// id2www() -- escape (using WWW hex codes) non-allowable characters in a
40// DAP2 identifier
41// www2id() -- given an WWW hexcode escaped identifier, restore it
42//
43// These two routines are for escaping/unescaping strings storing attribute
44// values. They use traditional octal escapes (\nnn) because they are
45// intended to be viewed by a user
46// escattr() -- escape (using traditional octal backslash) non-allowable
47// characters in the value of a DAP2 attribute
48// unescattr() -- given an octally escaped string, restore it
49//
50// These are routines used by the above, not intended to be called directly:
51//
52// hexstring()
53// unhexstring()
54// octstring()
55// unoctstring()
56//
57// -Todd
58
59#include "config.h"
60
61#include <ctype.h>
62
63#include <iomanip>
64#include <string>
65#include <sstream>
66
67#include "GNURegex.h"
68#include "Error.h"
69#include "InternalErr.h"
70//#define DODS_DEBUG
71#include "debug.h"
72
73using namespace std;
74
75namespace libdap {
76
77// The next four functions were originally defined static, but I removed that
78// to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
79// jhrg
80
/**
 * Format a byte as two lowercase hexadecimal digits.
 *
 * @param val The byte to format.
 * @return A two-character, zero-padded string (for example 10 yields "0a").
 */
std::string
hexstring(unsigned char val)
{
    std::ostringstream digits;
    digits.setf(std::ios_base::hex, std::ios_base::basefield);
    digits.fill('0');
    digits.width(2);
    // Widen first so the byte is printed as a number, not as a character.
    digits << static_cast<unsigned int>(val);

    return digits.str();
}
89
/**
 * Decode a string of hexadecimal digits into the character it encodes.
 *
 * @param s Hexadecimal digits, e.g. "41".
 * @return A one-character string holding the decoded byte. The result is
 * empty when no hex digits could be read from `s` or when the decoded byte
 * is NUL.
 */
std::string
unhexstring(std::string s)
{
    // Start from a known value: if the stream fails before any digit is read
    // (for example when `s` is empty) the extraction may leave `val`
    // untouched, and reading an uninitialized int is undefined behavior.
    int val = 0;
    std::istringstream ss(s);
    ss >> std::hex >> val;

    const char decoded = static_cast<char>(val);
    // A NUL byte yields an empty string, exactly as building the result from
    // a NUL-terminated buffer would.
    return decoded == '\0' ? std::string() : std::string(1, decoded);
}
101
/**
 * Format a byte as three octal digits.
 *
 * @param val The byte to format.
 * @return A three-character, zero-padded string (for example 10 yields "012").
 */
std::string
octstring(unsigned char val)
{
    std::ostringstream digits;
    digits.setf(std::ios_base::oct, std::ios_base::basefield);
    digits.fill('0');
    digits.width(3);
    // Widen first so the byte is printed as a number, not as a character.
    digits << static_cast<unsigned int>(val);

    return digits.str();
}
111
112string
113unoctstring(string s)
114{
115 int val;
116
117 istringstream ss(s);
118 ss >> oct >> val;
119
120 DBG(cerr << "unoctstring: " << val << endl);
121
122 char tmp_str[2];
123 tmp_str[0] = static_cast<char>(val);
124 tmp_str[1] = '\0';
125 return string(tmp_str);
126}
127
152string
153id2www(string in, const string &allowable)
154{
155 string::size_type i = 0;
156 DBG(cerr<<"Input string: [" << in << "]" << endl);
157 while ((i = in.find_first_not_of(allowable, i)) != string::npos) {
158 DBG(cerr<<"Found escapee: [" << in[i] << "]");
159 in.replace(i, 1, "%" + hexstring(in[i]));
160 DBGN(cerr<<" now the string is: " << in << endl);
161 i += 3;//i++;
162 }
163
164 return in;
165}
166
177string
178id2www_ce(string in, const string &allowable)
179{
180 return id2www(in, allowable);
181
182
183}
184
219string
220www2id(const string &in, const string &escape, const string &except)
221{
222 string::size_type i = 0;
223 string res = in;
224 while ((i = res.find_first_of(escape, i)) != string::npos) {
225 if (except.find(res.substr(i, 3)) != string::npos) {
226 i += 3;
227 continue;
228 }
229 res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
230 ++i;
231 }
232
233 return res;
234}
235
236static string
237entity(char c)
238{
239 switch (c) {
240 case '>': return "&gt;";
241 case '<': return "&lt;";
242 case '&': return "&amp;";
243 case '\'': return "&apos;";
244 case '\"': return "&quot;";
245 default:
246 throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
247 }
248}
249
/**
 * Convert a string of octal digits to the same value written in hex.
 *
 * @param octal_digits The octal digits to convert, e.g. "012".
 * @return The value as lowercase hex digits, zero-padded to at least two
 * characters (e.g. "0a"). When no octal digits can be read the result
 * is "00".
 */
std::string
octal_to_hex(const std::string &octal_digits)
{
    // Start from a known value: if the stream fails before any digit is read
    // (for example on an empty string) the extraction may leave `val`
    // untouched, and reading an uninitialized int is undefined behavior.
    int val = 0;

    std::istringstream ss(octal_digits);
    ss >> std::oct >> val;

    std::ostringstream ds;
    ds << std::hex << std::setw(2) << std::setfill('0') << val;
    return ds.str();
}
264
271string
272id2xml(string in, const string &not_allowed)
273{
274 string::size_type i = 0;
275
276 while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
277 in.replace(i, 1, entity(in[i]));
278 ++i;
279 }
280#if 0
281 // Removed the encoding of octal escapes. This function is used by
282 // AttrTable to encode the stuff that is the value of the <value>
283 // element in the DDX. The problem is that some of the values are not
284 // valid UTF-8 and that makes a XML parser gag.; ticket 1512.
285 // jhrg 3/19/10
286
287 // OK, now scan for octal escape sequences like \\012 (where the '\'
288 // is itself escaped). This type of attribute value comes from the netCDF
289 // handler and maybe others. Assumption: The '\' will always appear as
290 // in its escaped form: '\\'. NB: Both backslashes must be escaped in the
291 // C++ string.
292 string octal_escape = "\\\\";
293 i = 0;
294 string::size_type length = in.length();
295 while ((i = in.find(octal_escape, i)) != string::npos) {
296 // Get the three octal digits following the '\\0'
297 string::size_type j = i + 2;
298 if (j + 1 >= length) // Check that we're not past the end
299 break;
300 string octal_digits = in.substr(j, 3);
301 // convert to a &#xdd; XML escape
302 string hex_escape = string("&#x");
303 hex_escape.append(octal_to_hex(octal_digits));
304 hex_escape.append(string(";"));
305
306 // replace the octal escape with an XML/hex escape
307 in.replace(i, 5, hex_escape);
308
309 // increment i
310 i += 6;
311 }
312#endif
313 return in;
314}
315
/**
 * Replace the five predefined XML entity references with the characters they
 * stand for.
 *
 * Recognized references are "&gt;", "&lt;", "&amp;", "&apos;" and "&quot;".
 * The input is decoded in a single left-to-right pass and the output of a
 * replacement is never scanned again, so text is decoded exactly one level:
 * "&amp;quot;" becomes "&quot;", not a double quote. Any other '&' is left
 * untouched.
 *
 * @param in The string to decode (taken by value and modified).
 * @return The decoded string.
 */
std::string
xml2id(std::string in)
{
    struct Entity {
        const char *text;               // the entity reference
        std::string::size_type length;  // number of characters in `text`
        char plain;                     // the character it stands for
    };
    static const Entity entities[] = {
        { "&gt;", 4, '>' },
        { "&lt;", 4, '<' },
        { "&amp;", 5, '&' },
        { "&apos;", 6, '\'' },
        { "&quot;", 6, '"' }
    };
    static const std::string::size_type entity_count
        = sizeof(entities) / sizeof(entities[0]);

    std::string::size_type i = 0;
    while ((i = in.find('&', i)) != std::string::npos) {
        for (std::string::size_type k = 0; k < entity_count; ++k) {
            if (in.compare(i, entities[k].length, entities[k].text) == 0) {
                in.replace(i, entities[k].length, std::string(1, entities[k].plain));
                break;
            }
        }
        // Step past either the decoded character or an unrecognized '&'; this
        // is what keeps a freshly decoded '&' from starting a second decode.
        ++i;
    }

    return in;
}
347
/**
 * Replace every %-escape in a string with a single underscore.
 *
 * Each '%' together with the (up to) two characters that follow it is
 * replaced by "_".
 *
 * @param s The string to process (taken by value and modified).
 * @return The string with every escape sequence replaced.
 */
std::string
esc2underscore(std::string s)
{
    std::string::size_type pos = 0;
    while ((pos = s.find('%', pos)) != std::string::npos) {
        s.replace(pos, 3, "_");
        // The replacement contains no '%', so the search can resume after it
        // instead of rescanning from the start of the string.
        ++pos;
    }

    return s;
}
362
363
367string
368escattr(string s)
369{
370 const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"";
371 const string ESC = "\\";
372 const string DOUBLE_ESC = ESC + ESC;
373 const string QUOTE = "\"";
374 const string ESCQUOTE = ESC + QUOTE;
375
376 // escape \ with a second backslash
377 string::size_type ind = 0;
378 while ((ind = s.find(ESC, ind)) != s.npos) {
379 s.replace(ind, 1, DOUBLE_ESC);
380 ind += DOUBLE_ESC.length();
381 }
382
383 // escape non-printing characters with octal escape
384 ind = 0;
385 while ((ind = s.find_first_not_of(printable, ind)) != s.npos)
386 s.replace(ind, 1, ESC + octstring(s[ind]));
387
388 // escape " with backslash
389 ind = 0;
390 while ((ind = s.find(QUOTE, ind)) != s.npos) {
391 s.replace(ind, 1, ESCQUOTE);
392 ind += ESCQUOTE.length();
393 }
394
395 return s;
396}
397
406string
407unescattr(string s)
408{
409 const Regex octal("\\\\[0-3][0-7][0-7]"); // matches 4 characters
410 const Regex esc_quote("\\\\\""); // matches 3 characters
411 const Regex esc_esc("\\\\\\\\"); // matches 2 characters
412 const string ESC = "\\";
413 const string QUOTE = "\"";
414 int matchlen;
415 unsigned int index;
416
417 DBG(cerr << "0XX" << s << "XXX" << endl);
418 // unescape any escaped backslashes
419 index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
420 while (index < s.length()) {
421 DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
422 s.replace(index, 2, ESC);
423 DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
424 index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
425 }
426
427 // unescape any escaped double quote characters
428 index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
429 while (index < s.length()) {
430 s.replace(index, 2, QUOTE);
431 DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
432 index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
433 }
434
435 // unescape octal characters
436 index = octal.search(s.c_str(), s.length(), matchlen, 0);
437 while (index < s.length()) {
438 s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
439 DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
440 index = octal.search(s.c_str(), s.length(), matchlen, 0);
441 }
442
443 DBG(cerr << "4XX" << s << "XXX" << endl);
444 return s;
445}
446
/**
 * Make sure a message is enclosed in double quotes and that any double quote
 * inside it is escaped.
 *
 * A leading and/or trailing double quote is added when missing. Every
 * interior double quote that is not already preceded by a backslash gets a
 * backslash inserted in front of it. An empty message, or one that consists
 * of a single double quote, yields a pair of double quotes.
 *
 * @param msg The message (taken by value and modified).
 * @return The quoted and escaped message.
 */
std::string
munge_error_message(std::string msg)
{
    // First, add enclosing quotes if needed. The emptiness and length checks
    // keep an empty or one-character message from being inspected out of
    // bounds and guarantee that there are two distinct enclosing quotes.
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');
    if (msg.length() < 2 || msg[msg.length() - 1] != '"')
        msg += "\"";

    // Now escape any internal double quotes that aren't escaped. Indices are
    // used instead of iterators because insert() may reallocate the string.
    for (std::string::size_type i = 1; i + 1 < msg.length(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i; // now on the quote itself; the loop increment moves past it
        }
    }

    return msg;
}
464
/**
 * Put a backslash in front of every double quote in a string.
 *
 * @param source The string to escape (taken by value and modified).
 * @return The string with each '"' replaced by a backslash and a '"'.
 */
std::string
escape_double_quotes(std::string source)
{
    std::string::size_type idx = 0;
    while ((idx = source.find('\"', idx)) != std::string::npos) {
        source.replace(idx, 1, "\\\""); // a backslash and a double quote
        idx += 2; // skip both characters that were just written
    }

    return source;
}
480
/**
 * Remove the backslash in front of every escaped double quote in a string.
 *
 * @param source The string to unescape (taken by value and modified).
 * @return The string with each backslash-plus-'"' pair replaced by a '"'.
 */
std::string
unescape_double_quotes(std::string source)
{
    std::string::size_type idx = 0;
    while ((idx = source.find("\\\"", idx)) != std::string::npos) {
        source.replace(idx, 2, "\""); // just the double quote
        ++idx; // step past the quote that was kept
    }

    return source;
}
497
498} // namespace libdap
499
Regular expression matching.
Definition: GNURegex.h:57
int search(const char *s, int len, int &matchlen, int pos=0) const
How much of the string does the pattern match.
Definition: GNURegex.cc:206
top level DAP object to house generic methods
Definition: AlarmHandler.h:36
string esc2underscore(string s)
Definition: escaping.cc:354
string escattr(string s)
Definition: escaping.cc:368
string www2id(const string &in, const string &escape, const string &except)
Definition: escaping.cc:220
string unescape_double_quotes(string source)
Definition: escaping.cc:487
string xml2id(string in)
Definition: escaping.cc:322
string id2xml(string in, const string &not_allowed)
Definition: escaping.cc:272
string unescattr(string s)
Definition: escaping.cc:407
string escape_double_quotes(string source)
Definition: escaping.cc:470
string id2www_ce(string in, const string &allowable)
Definition: escaping.cc:178
string id2www(string in, const string &allowable)
Definition: escaping.cc:153