# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
Contains a class that makes it simple to stream out well-formed and
nicely-indented XML.
"""
# STDLIB
import contextlib
import textwrap
try:
from . import _iterparser
except ImportError:
def xml_escape_cdata(s):
"""
Escapes &, < and > in an XML CDATA string.
"""
s = s.replace("&", "&")
s = s.replace("<", "<")
s = s.replace(">", ">")
return s
def xml_escape(s):
"""
Escapes &, ', ", < and > in an XML attribute value.
"""
s = s.replace("&", "&")
s = s.replace("'", "'")
s = s.replace('"', """)
s = s.replace("<", "<")
s = s.replace(">", ">")
return s
else:
xml_escape_cdata = _iterparser.escape_xml_cdata
xml_escape = _iterparser.escape_xml
[docs]class XMLWriter:
"""
A class to write well-formed and nicely indented XML.
Use like this::
w = XMLWriter(fh)
with w.tag('html'):
with w.tag('body'):
w.data('This is the content')
Which produces::
<html>
<body>
This is the content
</body>
</html>
"""
def __init__(self, file):
"""
Parameters
----------
file : writable file-like
"""
self.write = file.write
if hasattr(file, "flush"):
self.flush = file.flush
self._open = 0 # true if start tag is open
self._tags = []
self._data = []
self._indentation = " " * 64
self.xml_escape_cdata = xml_escape_cdata
self.xml_escape = xml_escape
def _flush(self, indent=True, wrap=False):
"""
Flush internal buffers.
"""
if self._open:
if indent:
self.write(">\n")
else:
self.write(">")
self._open = 0
if self._data:
data = "".join(self._data)
if wrap:
indent = self.get_indentation_spaces(1)
data = textwrap.fill(
data, initial_indent=indent, subsequent_indent=indent
)
self.write("\n")
self.write(self.xml_escape_cdata(data))
self.write("\n")
self.write(self.get_indentation_spaces())
else:
self.write(self.xml_escape_cdata(data))
self._data = []
[docs] def start(self, tag, attrib={}, **extra):
"""
Opens a new element. Attributes can be given as keyword
arguments, or as a string/string dictionary. The method
returns an opaque identifier that can be passed to the
:meth:`close` method, to close all open elements up to and
including this one.
Parameters
----------
tag : str
The element name
attrib : dict of str -> str
Attribute dictionary. Alternatively, attributes can
be given as keyword arguments.
Returns
-------
id : int
Returns an element identifier.
"""
self._flush()
# This is just busy work -- we know our tag names are clean
# tag = xml_escape_cdata(tag)
self._data = []
self._tags.append(tag)
self.write(self.get_indentation_spaces(-1))
self.write(f"<{tag}")
if attrib or extra:
attrib = attrib.copy()
attrib.update(extra)
attrib = list(attrib.items())
attrib.sort()
for k, v in attrib:
if v is not None:
# This is just busy work -- we know our keys are clean
# k = xml_escape_cdata(k)
v = self.xml_escape(v)
self.write(f' {k}="{v}"')
self._open = 1
return len(self._tags)
[docs] @contextlib.contextmanager
def xml_cleaning_method(self, method="escape_xml", **clean_kwargs):
"""Context manager to control how XML data tags are cleaned (escaped) to
remove potentially unsafe characters or constructs.
The default (``method='escape_xml'``) applies brute-force escaping of
certain key XML characters like ``<``, ``>``, and ``&`` to ensure that
the output is not valid XML.
In order to explicitly allow certain XML tags (e.g. link reference or
emphasis tags), use ``method='bleach_clean'``. This sanitizes the data
string using the ``clean`` function of the
`bleach <https://bleach.readthedocs.io/en/latest/clean.html>`_ package.
Any additional keyword arguments will be passed directly to the
``clean`` function.
Finally, use ``method='none'`` to disable any sanitization. This should
be used sparingly.
Example::
w = writer.XMLWriter(ListWriter(lines))
with w.xml_cleaning_method('bleach_clean'):
w.start('td')
w.data('<a href="https://google.com">google.com</a>')
w.end()
Parameters
----------
method : str
Cleaning method. Allowed values are "escape_xml",
"bleach_clean", and "none".
**clean_kwargs : keyword args
Additional keyword args that are passed to the
bleach.clean() function.
"""
current_xml_escape_cdata = self.xml_escape_cdata
if method == "bleach_clean":
# NOTE: bleach is imported locally to avoid importing it when
# it is not nocessary
try:
import bleach
except ImportError:
raise ValueError(
"bleach package is required when HTML escaping is disabled.\n"
'Use "pip install bleach".'
)
if clean_kwargs is None:
clean_kwargs = {}
self.xml_escape_cdata = lambda x: bleach.clean(x, **clean_kwargs)
elif method == "none":
self.xml_escape_cdata = lambda x: x
elif method != "escape_xml":
raise ValueError(
'allowed values of method are "escape_xml", "bleach_clean", and "none"'
)
yield
self.xml_escape_cdata = current_xml_escape_cdata
[docs] @contextlib.contextmanager
def tag(self, tag, attrib={}, **extra):
"""
A convenience method for creating wrapper elements using the
``with`` statement.
Examples
--------
>>> with writer.tag('foo'): # doctest: +SKIP
... writer.element('bar')
... # </foo> is implicitly closed here
...
Parameters are the same as to `start`.
"""
self.start(tag, attrib, **extra)
yield
self.end(tag)
[docs] def data(self, text):
"""
Adds character data to the output stream.
Parameters
----------
text : str
Character data, as a Unicode string.
"""
self._data.append(text)
[docs] def end(self, tag=None, indent=True, wrap=False):
"""
Closes the current element (opened by the most recent call to
`start`).
Parameters
----------
tag : str
Element name. If given, the tag must match the start tag.
If omitted, the current element is closed.
"""
if tag:
if not self._tags:
raise ValueError(f"unbalanced end({tag})")
if tag != self._tags[-1]:
raise ValueError(f"expected end({self._tags[-1]}), got {tag}")
else:
if not self._tags:
raise ValueError("unbalanced end()")
tag = self._tags.pop()
if self._data:
self._flush(indent, wrap)
elif self._open:
self._open = 0
self.write("/>\n")
return
if indent:
self.write(self.get_indentation_spaces())
self.write(f"</{tag}>\n")
[docs] def close(self, id):
"""
Closes open elements, up to (and including) the element identified
by the given identifier.
Parameters
----------
id : int
Element identifier, as returned by the `start` method.
"""
while len(self._tags) > id:
self.end()
[docs] def element(self, tag, text=None, wrap=False, attrib={}, **extra):
"""
Adds an entire element. This is the same as calling `start`,
`data`, and `end` in sequence. The ``text`` argument
can be omitted.
"""
self.start(tag, attrib, **extra)
if text:
self.data(text)
self.end(indent=False, wrap=wrap)
[docs] def flush(self):
pass # replaced by the constructor
[docs] def get_indentation(self):
"""
Returns the number of indentation levels the file is currently
in.
"""
return len(self._tags)
[docs] def get_indentation_spaces(self, offset=0):
"""
Returns a string of spaces that matches the current
indentation level.
"""
return self._indentation[: len(self._tags) + offset]
[docs] @staticmethod
def object_attrs(obj, attrs):
"""
Converts an object with a bunch of attributes on an object
into a dictionary for use by the `XMLWriter`.
Parameters
----------
obj : object
Any Python object
attrs : sequence of str
Attribute names to pull from the object
Returns
-------
attrs : dict
Maps attribute names to the values retrieved from
``obj.attr``. If any of the attributes is `None`, it will
not appear in the output dictionary.
"""
d = {}
for attr in attrs:
if getattr(obj, attr) is not None:
d[attr.replace("_", "-")] = str(getattr(obj, attr))
return d