lxml.html package
Submodules
- lxml.html.ElementSoup module
- lxml.html._diffcommand module
- lxml.html._setmixin module
SetMixin
SetMixin._from_iterable()
SetMixin._hash()
SetMixin.add()
SetMixin.clear()
SetMixin.copy()
SetMixin.difference()
SetMixin.difference_update()
SetMixin.discard()
SetMixin.intersection()
SetMixin.intersection_update()
SetMixin.isdisjoint()
SetMixin.issubset()
SetMixin.issuperset()
SetMixin.pop()
SetMixin.remove()
SetMixin.symmetric_difference()
SetMixin.symmetric_difference_update()
SetMixin.union()
SetMixin.update()
SetMixin._abc_impl
- lxml.html.builder module
CLASS()
FOR()
A
ABBR
ACRONYM
ADDRESS
APPLET
AREA
B
BASE
BASEFONT
BDO
BIG
BLOCKQUOTE
BODY
BR
BUTTON
CAPTION
CENTER
CITE
CODE
COL
COLGROUP
DD
DEL
DFN
DIR
DIV
DL
DT
EM
FIELDSET
FONT
FORM
FRAME
FRAMESET
H1
H2
H3
H4
H5
H6
HEAD
HR
HTML
I
IFRAME
IMG
INPUT
INS
ISINDEX
KBD
LABEL
LEGEND
LI
LINK
MAP
MENU
META
NOFRAMES
NOSCRIPT
OBJECT
OL
OPTGROUP
OPTION
P
PARAM
PRE
Q
S
SAMP
SCRIPT
SELECT
SMALL
SPAN
STRIKE
STRONG
STYLE
SUB
SUP
TABLE
TBODY
TD
TEXTAREA
TFOOT
TH
THEAD
TITLE
TR
TT
U
UL
VAR
- lxml.html.clean module
Cleaner
Cleaner._has_sneaky_javascript()
Cleaner._kill_elements()
Cleaner._remove_javascript_link()
Cleaner._substitute_comments()
Cleaner.allow_element()
Cleaner.allow_embedded_url()
Cleaner.allow_follow()
Cleaner.clean_html()
Cleaner.kill_conditional_comments()
Cleaner._tag_link_attrs
Cleaner.add_nofollow
Cleaner.allow_tags
Cleaner.annoying_tags
Cleaner.comments
Cleaner.embedded
Cleaner.forms
Cleaner.frames
Cleaner.host_whitelist
Cleaner.inline_style
Cleaner.javascript
Cleaner.kill_tags
Cleaner.links
Cleaner.meta
Cleaner.page_structure
Cleaner.processing_instructions
Cleaner.remove_tags
Cleaner.remove_unknown_tags
Cleaner.safe_attrs
Cleaner.safe_attrs_only
Cleaner.scripts
Cleaner.style
Cleaner.whitelist_tags
_break_text()
_find_image_dataurls()
_has_javascript_scheme()
_insert_break()
_is_unsafe_image_type()
_link_text()
_looks_like_tag_content()
_possibly_malicious_schemes()
_replace_css_import()
_replace_css_javascript()
_substitute_whitespace()
autolink()
autolink_html()
clean_html()
word_break()
word_break_html()
- lxml.html.defs module
- lxml.html.diff module
NoDeletes
DEL_END
DEL_START
InsensitiveSequenceMatcher
InsensitiveSequenceMatcher.find_longest_match()
InsensitiveSequenceMatcher.get_grouped_opcodes()
InsensitiveSequenceMatcher.get_matching_blocks()
InsensitiveSequenceMatcher.get_opcodes()
InsensitiveSequenceMatcher.quick_ratio()
InsensitiveSequenceMatcher.ratio()
InsensitiveSequenceMatcher.real_quick_ratio()
InsensitiveSequenceMatcher.set_seq1()
InsensitiveSequenceMatcher.set_seq2()
InsensitiveSequenceMatcher.set_seqs()
InsensitiveSequenceMatcher.threshold
href_token
href_token.capitalize()
href_token.casefold()
href_token.center()
href_token.count()
href_token.encode()
href_token.endswith()
href_token.expandtabs()
href_token.find()
href_token.format()
href_token.format_map()
href_token.html()
href_token.index()
href_token.isalnum()
href_token.isalpha()
href_token.isascii()
href_token.isdecimal()
href_token.isdigit()
href_token.isidentifier()
href_token.islower()
href_token.isnumeric()
href_token.isprintable()
href_token.isspace()
href_token.istitle()
href_token.isupper()
href_token.join()
href_token.ljust()
href_token.lower()
href_token.lstrip()
href_token.maketrans()
href_token.partition()
href_token.removeprefix()
href_token.removesuffix()
href_token.replace()
href_token.rfind()
href_token.rindex()
href_token.rjust()
href_token.rpartition()
href_token.rsplit()
href_token.rstrip()
href_token.split()
href_token.splitlines()
href_token.startswith()
href_token.strip()
href_token.swapcase()
href_token.title()
href_token.translate()
href_token.upper()
href_token.zfill()
href_token.hide_when_equal
tag_token
tag_token.capitalize()
tag_token.casefold()
tag_token.center()
tag_token.count()
tag_token.encode()
tag_token.endswith()
tag_token.expandtabs()
tag_token.find()
tag_token.format()
tag_token.format_map()
tag_token.html()
tag_token.index()
tag_token.isalnum()
tag_token.isalpha()
tag_token.isascii()
tag_token.isdecimal()
tag_token.isdigit()
tag_token.isidentifier()
tag_token.islower()
tag_token.isnumeric()
tag_token.isprintable()
tag_token.isspace()
tag_token.istitle()
tag_token.isupper()
tag_token.join()
tag_token.ljust()
tag_token.lower()
tag_token.lstrip()
tag_token.maketrans()
tag_token.partition()
tag_token.removeprefix()
tag_token.removesuffix()
tag_token.replace()
tag_token.rfind()
tag_token.rindex()
tag_token.rjust()
tag_token.rpartition()
tag_token.rsplit()
tag_token.rstrip()
tag_token.split()
tag_token.splitlines()
tag_token.startswith()
tag_token.strip()
tag_token.swapcase()
tag_token.title()
tag_token.translate()
tag_token.upper()
tag_token.zfill()
tag_token.hide_when_equal
token
token.capitalize()
token.casefold()
token.center()
token.count()
token.encode()
token.endswith()
token.expandtabs()
token.find()
token.format()
token.format_map()
token.html()
token.index()
token.isalnum()
token.isalpha()
token.isascii()
token.isdecimal()
token.isdigit()
token.isidentifier()
token.islower()
token.isnumeric()
token.isprintable()
token.isspace()
token.istitle()
token.isupper()
token.join()
token.ljust()
token.lower()
token.lstrip()
token.maketrans()
token.partition()
token.removeprefix()
token.removesuffix()
token.replace()
token.rfind()
token.rindex()
token.rjust()
token.rpartition()
token.rsplit()
token.rstrip()
token.split()
token.splitlines()
token.startswith()
token.strip()
token.swapcase()
token.title()
token.translate()
token.upper()
token.zfill()
token.hide_when_equal
_contains_block_level_tag()
_fixup_ins_del_tags()
_merge_element_contents()
_move_el_inside_block()
cleanup_delete()
cleanup_html()
compress_merge_back()
compress_tokens()
copy_annotations()
default_markup()
end_tag()
expand_tokens()
fixup_chunks()
fixup_ins_del_tags()
flatten_el()
html_annotate()
html_annotate_merge_annotations()
htmldiff()
htmldiff_tokens()
is_end_tag()
is_start_tag()
is_word()
locate_unbalanced_end()
locate_unbalanced_start()
markup_serialize_tokens()
merge_delete()
merge_insert()
parse_html()
serialize_html_fragment()
split_delete()
split_trailing_whitespace()
split_unbalanced()
split_words()
start_tag()
tokenize()
tokenize_annotated()
- lxml.html.formfill module
- lxml.html.html5parser module
- lxml.html.soupparser module
Module contents
The lxml.html
tool set for HTML handling.
- class lxml.html.CheckboxGroup(iterable=(), /)[source]
Bases:
list
Represents a group of checkboxes (<input type=checkbox>) that have the same name.
In addition to using this like a list, the .value attribute returns a set-like object that you can add to or remove from to check and uncheck checkboxes. You can also use .value_options to get the possible values.
- append(object, /)
Append object to the end of the list.
- clear()
Remove all items from list.
- copy()
Return a shallow copy of the list.
- count(value, /)
Return number of occurrences of value.
- extend(iterable, /)
Extend list by appending elements from the iterable.
- index(value, start=0, stop=9223372036854775807, /)
Return first index of value.
Raises ValueError if the value is not present.
- insert(index, object, /)
Insert object before index.
- pop(index=-1, /)
Remove and return item at index (default last).
Raises IndexError if list is empty or index is out of range.
- remove(value, /)
Remove first occurrence of value.
Raises ValueError if the value is not present.
- reverse()
Reverse IN PLACE.
- sort(*, key=None, reverse=False)
Sort the list in ascending order and return None.
The sort is in-place (i.e. the list itself is modified) and stable (i.e. the order of two equal elements is maintained).
If a key function is given, apply it once to each list item and sort them, ascending or descending, according to their function values.
The reverse flag can be set to sort in descending order.
- property value
Return a set-like object that can be modified to check or uncheck individual checkboxes according to their value.
- property value_options
Returns a list of all the possible values.
- class lxml.html.CheckboxValues(group)[source]
Bases:
SetMixin
Represents the values of the checked checkboxes in a group of checkboxes with the same name.
- classmethod _from_iterable(it)
Construct an instance of the class from any iterable input.
Must override this method if the class constructor signature does not accept an iterable for an input.
- _hash()
Compute the hash value of a set.
Note that we don’t define __hash__: not all sets are hashable. But if you define a hashable set type, its __hash__ should call this function.
This must be compatible with __eq__.
All sets ought to compare equal if they contain the same elements, regardless of how they are implemented, and regardless of the order of the elements; so there’s not much freedom for __eq__ or __hash__. We match the algorithm used by the built-in frozenset type.
- clear()
This is slow (creates N new iterators!) but effective.
- copy()
- difference(other)
- difference_update(other)
- discard(item)
Remove an element. Do not raise an exception if absent.
- intersection(other)
- intersection_update(other)
- isdisjoint(other)
Return True if two sets have a null intersection.
- issubset(other)
Return self<=value.
- issuperset(other)
Return self>=value.
- pop()
Return the popped value. Raise KeyError if empty.
- symmetric_difference(other)
- symmetric_difference_update(other)
- union(other)
Return self|value.
- update(other)
- _abc_impl = <_abc._abc_data object>
- class lxml.html.Classes(attributes)[source]
Bases:
MutableSet
Provides access to an element’s class attribute as a set-like collection. Usage:
>>> el = fromstring('<p class="hidden large">Text</p>')
>>> classes = el.classes  # or: classes = Classes(el.attrib)
>>> classes |= ['block', 'paragraph']
>>> el.get('class')
'hidden large block paragraph'
>>> classes.toggle('hidden')
False
>>> el.get('class')
'large block paragraph'
>>> classes -= ('some', 'classes', 'block')
>>> el.get('class')
'large paragraph'
- classmethod _from_iterable(it)
Construct an instance of the class from any iterable input.
Must override this method if the class constructor signature does not accept an iterable for an input.
- _hash()
Compute the hash value of a set.
Note that we don’t define __hash__: not all sets are hashable. But if you define a hashable set type, its __hash__ should call this function.
This must be compatible with __eq__.
All sets ought to compare equal if they contain the same elements, regardless of how they are implemented, and regardless of the order of the elements; so there’s not much freedom for __eq__ or __hash__. We match the algorithm used by the built-in frozenset type.
- clear()
This is slow (creates N new iterators!) but effective.
- discard(value)[source]
Remove a class if it is currently present.
If the class is not present, do nothing.
- isdisjoint(other)
Return True if two sets have a null intersection.
- pop()
Return the popped value. Raise KeyError if empty.
- remove(value)[source]
Remove a class; it must currently be present.
If the class is not present, raise a KeyError.
- toggle(value)[source]
Add a class name if it isn’t there yet, or remove it if it exists.
Returns true if the class was added (and is now enabled) and false if it was removed (and is now disabled).
- _abc_impl = <_abc._abc_data object>
- class lxml.html.FieldsDict(inputs)[source]
Bases:
MutableMapping
- clear() → None. Remove all items from D.
- get(k[, d]) → D[k] if k in D, else d. d defaults to None.
- items() → a set-like object providing a view on D's items
- pop(k[, d]) → v, remove specified key and return the corresponding value.
If key is not found, d is returned if given, otherwise KeyError is raised.
- popitem() → (k, v), remove and return some (key, value) pair
as a 2-tuple; but raise KeyError if D is empty.
- setdefault(k[, d]) → D.get(k,d), also set D[k]=d if k not in D
- update([E, ]**F) → None. Update D from mapping/iterable E and F.
If E present and has a .keys() method, does: for k in E: D[k] = E[k]
If E present and lacks .keys() method, does: for (k, v) in E: D[k] = v
In either case, this is followed by: for k, v in F.items(): D[k] = v
- values() → an object providing a view on D's values
- _abc_impl = <_abc._abc_data object>
- class lxml.html.FormElement[source]
Bases:
HtmlElement
Represents a <form> element.
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, element)
Adds a subelement to the end of this element.
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- form_values()[source]
Return a list of tuples of the field values for the form. This is suitable to be passed to urllib.parse.urlencode().
- get(self, key, default=None)
Gets an element attribute.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, element)
Inserts a subelement at the given position in this element.
- items(self)
Gets element attributes, as a sequence. The attributes are returned in an arbitrary order.
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass "{ns}localname" as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text). pos is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the with_tail keyword argument to False to skip over tail text.
- keys(self)
Gets a list of attribute names. The names are returned in an arbitrary order (just like for an ordinary Python dictionary).
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the base_url for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any <base href> tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link link_repl_func(link) will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the link_repl_func returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for
form.set('novalidate')
.
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
Gets element attribute values as a sequence of strings. The attributes are returned in an arbitrary order.
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- property action
Get/set the form’s
action
attribute.
- attrib
Element attribute dictionary. Where possible, use get(), set(), keys(), values() and items() to access element attributes.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with urllib.parse.urljoin(el.base_url, href) to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property fields
Dictionary-like object that represents all the fields in this form. You can set values in this dictionary to affect the form.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property inputs
Returns an accessor for all the input elements in the form.
See InputGetter for more information about the object.
- property label
Get or set any <label> element associated with this element.
- property method
Get/set the form’s method. Always returns a capitalized string, and defaults to 'GET'.
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
Element tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
Text before the first subelement. This is either a string or the value None, if there was no text.
- class lxml.html.HTMLParser(**kwargs)[source]
Bases:
HTMLParser
An HTML parser that is configured to return lxml.html Element objects.
- close(self)
Terminates feeding data to this parser. This tells the parser to process any remaining data in the feed buffer, and then returns the root Element of the tree that was parsed.
This method must be called after passing the last chunk of data into the
feed()
method. It should only be called when using the feed parser interface, all other usage is undefined.
- copy(self)
Create a new parser with the same configuration.
- feed(self, data)
Feeds data to the parser. The argument should be an 8-bit string buffer containing encoded data, although Unicode is supported as long as both string types are not mixed.
This is the main entry point to the consumer interface of a parser. The parser will parse as much of the XML stream as it can on each call. To finish parsing or to reset the parser, call the close() method. Both methods may raise ParseError if errors occur in the input data. If an error is raised, there is no longer a need to call close().
The feed parser interface is independent of the normal parser usage. You can use the same parser as a feed parser and in the parse() function concurrently.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with this parser.
- setElementClassLookup(lookup)
- Deprecated
use
parser.set_element_class_lookup(lookup)
instead.
- set_element_class_lookup(self, lookup=None)
Set a lookup scheme for element classes generated from this parser.
Reset it by passing None or nothing.
- error_log
The error log of the last parser run.
- feed_error_log
The error log of the last (or current) run of the feed parser.
Note that this is local to the feed parser and thus is different from what the
error_log
property returns.
- resolvers
The custom resolver registry of this parser.
- target
- version
The version of the underlying XML parser.
- class lxml.html.HtmlComment[source]
Bases:
HtmlMixin
,CommentBase
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, value)
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, value)
- items(self)
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass
"{ns}localname"
as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos
is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the
with_tail
keyword argument to False
to skip over tail text.
- keys(self)
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the
base_url
for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any
<base href>
tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link
link_repl_func(link)
will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the
link_repl_func
returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for
form.set('novalidate')
.
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urllib.parse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
- class lxml.html.HtmlElement[source]
Bases:
HtmlMixin
,ElementBase
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, element)
Adds a subelement to the end of this element.
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
Gets an element attribute.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, element)
Inserts a subelement at the given position in this element
- items(self)
Gets element attributes, as a sequence. The attributes are returned in an arbitrary order.
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass
"{ns}localname"
as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos
is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the
with_tail
keyword argument to False
to skip over tail text.
- keys(self)
Gets a list of attribute names. The names are returned in an arbitrary order (just like for an ordinary Python dictionary).
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the
base_url
for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any
<base href>
tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link
link_repl_func(link)
will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the
link_repl_func
returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for
form.set('novalidate')
.
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
Gets element attribute values as a sequence of strings. The attributes are returned in an arbitrary order.
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
Element attribute dictionary. Where possible, use get(), set(), keys(), values() and items() to access element attributes.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urllib.parse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
Element tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
Text before the first subelement. This is either a string or the value None, if there was no text.
- class lxml.html.HtmlElementClassLookup(classes=None, mixins=None)[source]
Bases:
CustomElementClassLookup
A lookup scheme for HTML Element classes.
To create a lookup instance with different Element classes, pass a tag name mapping of Element classes in the
classes
keyword argument and/or a tag name mapping of Mixin classes in the mixins
keyword argument. The special key ‘*’ denotes a Mixin class that should be mixed into all Element classes.
- set_fallback(self, lookup)
Sets the fallback scheme for this lookup method.
- _default_element_classes = {'form': <class 'lxml.html.FormElement'>, 'input': <class 'lxml.html.InputElement'>, 'label': <class 'lxml.html.LabelElement'>, 'select': <class 'lxml.html.SelectElement'>, 'textarea': <class 'lxml.html.TextareaElement'>}
- fallback
- class lxml.html.HtmlEntity[source]
Bases:
HtmlMixin
,EntityBase
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, value)
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, value)
- items(self)
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass
"{ns}localname"
as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos
is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the with_tail keyword argument to False to skip over tail text.
- keys(self)
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the base_url for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any <base href> tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link link_repl_func(link) will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the link_repl_func returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for form.set('novalidate').
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urlparse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- name
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
- class lxml.html.HtmlMixin[source]
Bases:
object
- cssselect(expr, translator='html')[source]
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()[source]
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()[source]
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- find_rel_links(rel)[source]
Find any links like <a rel="{rel}">...</a>; returns a list of elements.
- get_element_by_id(id, *default)[source]
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- iterlinks()[source]
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)[source]
Make all links in the document absolute, given the base_url for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- resolve_base_href(handle_failures=None)[source]
Find any <base href> tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)[source]
Rewrite all the links in the document. For each link link_repl_func(link) will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the link_repl_func returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)[source]
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for form.set('novalidate').
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urlparse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- class lxml.html.HtmlProcessingInstruction[source]
-
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, value)
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like <a rel="{rel}">...</a>; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
Try to parse pseudo-attributes from the text content of the processing instruction, search for one with the given key as name and return its associated value.
Note that this is only a convenience method for the most common case that all text content is structured in attribute-like name-value pairs with properly quoted values. It is not guaranteed to work for all possible text content.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, value)
- items(self)
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass "{ns}localname" as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to el.iter(), this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the with_tail keyword argument to False to skip over tail text.
- keys(self)
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the base_url for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any <base href> tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link link_repl_func(link) will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the link_repl_func returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for form.set('novalidate').
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
Returns a dict containing all pseudo-attributes that can be parsed from the text content of this processing instruction. Note that modifying the dict currently has no effect on the XML node, although this is not guaranteed to stay this way.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urlparse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- target
- text
- class lxml.html.InputElement[source]
Bases: InputMixin, HtmlElement
Represents an <input> element.
You can get the type with .type (which is lower-cased and defaults to 'text').
Also you can get and set the value with .value.
Checkboxes and radios have the attribute input.checkable == True (for all others it is false) and a boolean attribute .checked.
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, element)
Adds a subelement to the end of this element.
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like <a rel="{rel}">...</a>; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
Gets an element attribute.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, element)
Inserts a subelement at the given position in this element
- items(self)
Gets element attributes, as a sequence. The attributes are returned in an arbitrary order.
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass "{ns}localname" as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to el.iter(), this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the with_tail keyword argument to False to skip over tail text.
- keys(self)
Gets a list of attribute names. The names are returned in an arbitrary order (just like for an ordinary Python dictionary).
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the base_url for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any <base href> tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link link_repl_func(link) will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the link_repl_func returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for form.set('novalidate').
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
Gets element attribute values as a sequence of strings. The attributes are returned in an arbitrary order.
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
Element attribute dictionary. Where possible, use get(), set(), keys(), values() and items() to access element attributes.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urlparse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property checkable
Boolean: can this element be checked?
- property checked
Boolean attribute to get/set the presence of the
checked
attribute. You can only use this on checkable input types.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- property name
Get/set the name of the element
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
Element tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
Text before the first subelement. This is either a string or the value None, if there was no text.
- property type
Return the type of this element (using the type attribute).
- property value
Get/set the value of this element, using the
value
attribute. Also, if this is a checkbox and it has no value, this defaults to
'on'
. If it is a checkbox or radio that is not checked, this returns None.
- class lxml.html.InputGetter(form)[source]
Bases:
object
An accessor that represents all the input fields in a form.
You can get fields by name from this, with
form.inputs['field_name']
. If there is a set of checkboxes with the same name, they are returned as a list (a CheckboxGroup which also allows value setting). Radio inputs are handled similarly. Use .keys()
and .items()
to process all fields in this way. You can also iterate over this to get all input elements. This won’t return the same thing as if you get all the names, as checkboxes and radio elements are returned individually.
- class lxml.html.InputMixin[source]
Bases:
object
Mix-in for all input elements (input, select, and textarea)
- property name
Get/set the name of the element
- class lxml.html.LabelElement[source]
Bases:
HtmlElement
Represents a
<label>
element. Label elements are linked to other elements with their
for
attribute. You can access this element with label.for_element
.
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, element)
Adds a subelement to the end of this element.
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
Gets an element attribute.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, element)
Inserts a subelement at the given position in this element
- items(self)
Gets element attributes, as a sequence. The attributes are returned in an arbitrary order.
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass
"{ns}localname"
as tag. Either or both of ns
and localname
can be *
for a wildcard; ns
can be empty for no namespace. "localname"
is equivalent to "{}localname"
(i.e. no namespace) but "*"
is "{*}*"
(any or no namespace), not "{}*"
. You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos
is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags. Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the
with_tail
keyword argument to False
to skip over tail text.
- keys(self)
Gets a list of attribute names. The names are returned in an arbitrary order (just like for an ordinary Python dictionary).
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the
base_url
for the document (the full URL where the document came from), or if no base_url
is given, then the .base_url
of the document. If
resolve_base_href
is true, then any <base href>
tags in the document are used and removed from the document. If it is false then any such tag is ignored. If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any
<base href>
tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied. If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link
link_repl_func(link)
will be called, and the return value will replace the old link. Note that links may not be absolute (unless you first called
make_links_absolute()
), and may be internal (e.g., '#anchor'
). They can also be values like 'mailto:email'
or 'javascript:expr'
. If you give
base_href
then all links passed to link_repl_func()
will take that into account. If the
link_repl_func
returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for
form.set('novalidate')
.
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
Gets element attribute values as a sequence of strings. The attributes are returned in an arbitrary order.
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
Element attribute dictionary. Where possible, use get(), set(), keys(), values() and items() to access element attributes.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urlparse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property for_element
Get/set the element this label points to. Return None if it can’t be found.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
Element tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
Text before the first subelement. This is either a string or the value None, if there was no text.
- class lxml.html.MultipleSelectOptions(select)[source]
Bases:
SetMixin
Represents all the selected options in a
<select multiple>
element. You can add to this set-like object to select an option, or remove from it to unselect the option.
- classmethod _from_iterable(it)
Construct an instance of the class from any iterable input.
Must override this method if the class constructor signature does not accept an iterable for an input.
- _hash()
Compute the hash value of a set.
Note that we don’t define __hash__: not all sets are hashable. But if you define a hashable set type, its __hash__ should call this function.
This must be compatible with __eq__.
All sets ought to compare equal if they contain the same elements, regardless of how they are implemented, and regardless of the order of the elements; so there’s not much freedom for __eq__ or __hash__. We match the algorithm used by the built-in frozenset type.
- clear()
This is slow (creates N new iterators!) but effective.
- copy()
- difference(other)
- difference_update(other)
- discard(item)
Remove an element. Do not raise an exception if absent.
- intersection(other)
- intersection_update(other)
- isdisjoint(other)
Return True if two sets have a null intersection.
- issubset(other)
Return self<=value.
- issuperset(other)
Return self>=value.
- pop()
Return the popped value. Raise KeyError if empty.
- symmetric_difference(other)
- symmetric_difference_update(other)
- union(other)
Return self|value.
- update(other)
- _abc_impl = <_abc._abc_data object>
- property options
Iterator of all the
<option>
elements.
- class lxml.html.RadioGroup(iterable=(), /)[source]
Bases:
list
This object represents several
<input type=radio>
elements that have the same name. You can use this like a list, but also use the property
.value
to check/uncheck inputs. Also you can use .value_options
to get the possible values.
- append(object, /)
Append object to the end of the list.
- clear()
Remove all items from list.
- copy()
Return a shallow copy of the list.
- count(value, /)
Return number of occurrences of value.
- extend(iterable, /)
Extend list by appending elements from the iterable.
- index(value, start=0, stop=9223372036854775807, /)
Return first index of value.
Raises ValueError if the value is not present.
- insert(index, object, /)
Insert object before index.
- pop(index=-1, /)
Remove and return item at index (default last).
Raises IndexError if list is empty or index is out of range.
- remove(value, /)
Remove first occurrence of value.
Raises ValueError if the value is not present.
- reverse()
Reverse IN PLACE.
- sort(*, key=None, reverse=False)
Sort the list in ascending order and return None.
The sort is in-place (i.e. the list itself is modified) and stable (i.e. the order of two equal elements is maintained).
If a key function is given, apply it once to each list item and sort them, ascending or descending, according to their function values.
The reverse flag can be set to sort in descending order.
- property value
Get/set the value, which checks the radio with that value (and unchecks any other value).
- property value_options
Returns a list of all the possible values.
- class lxml.html.SelectElement[source]
Bases:
InputMixin
,HtmlElement
<select>
element. You can get the name with .name
. .value
will be the value of the selected option, unless this is a multi-select element (<select multiple>
), in which case it will be a set-like object. In either case .value_options
gives the possible values. The boolean attribute
.multiple
shows if this is a multi-select.
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, element)
Adds a subelement to the end of this element.
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
Gets an element attribute.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, element)
Inserts a subelement at the given position in this element
- items(self)
Gets element attributes, as a sequence. The attributes are returned in an arbitrary order.
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass
"{ns}localname"
as tag. Either or both of ns
and localname
can be *
for a wildcard; ns
can be empty for no namespace. "localname"
is equivalent to "{}localname"
(i.e. no namespace) but "*"
is "{*}*"
(any or no namespace), not "{}*"
. You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text).
pos
is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags. Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the
with_tail
keyword argument to False
to skip over tail text.
- keys(self)
Gets a list of attribute names. The names are returned in an arbitrary order (just like for an ordinary Python dictionary).
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the
base_url
for the document (the full URL where the document came from), or if no base_url
is given, then the .base_url
of the document. If
resolve_base_href
is true, then any <base href>
tags in the document are used and removed from the document. If it is false then any such tag is ignored. If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any
<base href>
tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied. If
handle_failures
is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link
link_repl_func(link)
will be called, and the return value will replace the old link. Note that links may not be absolute (unless you first called
make_links_absolute()
), and may be internal (e.g., '#anchor'
). They can also be values like 'mailto:email'
or 'javascript:expr'
. If you give
base_href
then all links passed to link_repl_func()
will take that into account. If the
link_repl_func
returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for
form.set('novalidate')
.
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
Gets element attribute values as a sequence of strings. The attributes are returned in an arbitrary order.
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
Element attribute dictionary. Where possible, use get(), set(), keys(), values() and items() to access element attributes.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with
urlparse.urljoin(el.base_url, href)
to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- property multiple
Boolean attribute: is there a
multiple
attribute on this element.
- property name
Get/set the name of the element
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
Element tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
Text before the first subelement. This is either a string or the value None, if there was no text.
- property value
Get/set the value of this select (the selected option).
If this is a multi-select, this is a set-like object that represents all the selected options.
- property value_options
All the possible values this select can have (the
value
attribute of all the <option>
elements).
- class lxml.html.TextareaElement[source]
Bases:
InputMixin
,HtmlElement
<textarea>
element. You can get the name with .name
and get/set the value with .value
- _init(self)
Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses.
- addnext(self, element)
Adds the element as a following sibling directly after this element.
This is normally used to set a processing instruction or comment after the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- addprevious(self, element)
Adds the element as a preceding sibling directly before this element.
This is normally used to set a processing instruction or comment before the root node of a document. Note that tail text is automatically discarded when adding at the root level.
- append(self, element)
Adds a subelement to the end of this element.
- clear(self, keep_tail=False)
Resets an element. This function removes all subelements, clears all attributes and sets the text and tail properties to None.
Pass
keep_tail=True
to leave the tail text untouched.
- cssselect(expr, translator='html')
Run the CSS expression on this element and its children, returning a list of the results.
Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self) – note that pre-compiling the expression can provide a substantial speedup.
- drop_tag()
Remove the tag, but not its children or text. The children and text are merged into the parent.
Example:
>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
>>> h.find('.//b').drop_tag()
>>> print(tostring(h, encoding='unicode'))
<div>Hello World!</div>
- drop_tree()
Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent.
- extend(self, elements)
Extends the current children by the elements in the iterable.
- find(self, path, namespaces=None)
Finds the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- find_class(class_name)
Find any elements with the given class name.
- find_rel_links(rel)
Find any links like
<a rel="{rel}">...</a>
; returns a list of elements.
- findall(self, path, namespaces=None)
Finds all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- findtext(self, path, default=None, namespaces=None)
Finds text for the first matching subelement, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- get(self, key, default=None)
Gets an element attribute.
- get_element_by_id(id, *default)
Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise.
Note that there can be more than one element with the same id, and this isn’t uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same.
- getchildren(self)
Returns all direct children. The elements are returned in document order.
- Deprecated
Note that this method has been deprecated as of ElementTree 1.3 and lxml 2.0. New code should use
list(element)
or simply iterate over elements.
- getiterator(self, tag=None, *tags)
Returns a sequence or iterator of all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags, see iter.
- Deprecated
Note that this method is deprecated as of ElementTree 1.3 and lxml 2.0. It returns an iterator in lxml, which diverges from the original ElementTree behaviour. If you want an efficient iterator, use the
element.iter()
method instead. You should only use this method in new code if you require backwards compatibility with older versions of lxml or ElementTree.
- getnext(self)
Returns the following sibling of this element or None.
- getparent(self)
Returns the parent of this element or None for the root element.
- getprevious(self)
Returns the preceding sibling of this element or None.
- getroottree(self)
Return an ElementTree for the root node of the document that contains this element.
This is the same as following element.getparent() up the tree until it returns None (for the root element) and then build an ElementTree for the last parent that was returned.
- index(self, child, start=None, stop=None)
Find the position of the child within the parent.
This method is not part of the original ElementTree API.
- insert(self, index, element)
Inserts a subelement at the given position in this element
- items(self)
Gets element attributes, as a sequence. The attributes are returned in an arbitrary order.
- iter(self, tag=None, *tags)
Iterate over all elements in the subtree in document order (depth first pre-order), starting with this element.
Can be restricted to find only elements with specific tags: pass "{ns}localname" as tag. Either or both of ns and localname can be * for a wildcard; ns can be empty for no namespace. "localname" is equivalent to "{}localname" (i.e. no namespace) but "*" is "{*}*" (any or no namespace), not "{}*".
You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type.
Passing multiple tags (or a sequence of tags) instead of a single tag will let the iterator return all elements matching any of these tags, in document order.
- iterancestors(self, tag=None, *tags)
Iterate over the ancestors of this element (from parent to parent).
Can be restricted to find only elements with specific tags, see iter.
- iterchildren(self, tag=None, *tags, reversed=False)
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned elements can be reversed with the ‘reversed’ keyword and restricted to find only elements with specific tags, see iter.
- iterdescendants(self, tag=None, *tags)
Iterate over the descendants of this element in document order.
As opposed to
el.iter()
, this iterator does not yield the element itself. The returned elements can be restricted to find only elements with specific tags, see iter.
- iterfind(self, path, namespaces=None)
Iterates over all matching subelements, by tag name or path.
The optional
namespaces
argument accepts a prefix-to-namespace mapping that allows the usage of XPath prefixes in the path expression.
- iterlinks()
Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text). pos is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags.
Note: <base href> is not taken into account in any way. The link you get is exactly the link in the document.
Note: multiple links inside of a single text string or attribute value are returned in reversed order. This makes it possible to replace or delete them from the text string value based on their reported text positions. Otherwise, a modification at one text position can change the positions of links reported later on.
- itersiblings(self, tag=None, *tags, preceding=False)
Iterate over the following or preceding siblings of this element.
The direction is determined by the ‘preceding’ keyword which defaults to False, i.e. forward iteration over the following siblings. When True, the iterator yields the preceding siblings in reverse document order, i.e. starting right before the current element and going backwards.
Can be restricted to find only elements with specific tags, see iter.
- itertext(self, tag=None, *tags, with_tail=True)
Iterates over the text content of a subtree.
You can pass tag names to restrict text content to specific elements, see iter.
You can set the with_tail keyword argument to False to skip over tail text.
- keys(self)
Gets a list of attribute names. The names are returned in an arbitrary order (just like for an ordinary Python dictionary).
- make_links_absolute(base_url=None, resolve_base_href=True, handle_failures=None)
Make all links in the document absolute, given the base_url for the document (the full URL where the document came from), or if no base_url is given, then the .base_url of the document.
If resolve_base_href is true, then any <base href> tags in the document are used and removed from the document. If it is false then any such tag is ignored.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with the same document.
- remove(self, element)
Removes a matching subelement. Unlike the find methods, this method compares elements based on identity, not on tag value or contents.
- replace(self, old_element, new_element)
Replaces a subelement with the element passed as second argument.
- resolve_base_href(handle_failures=None)
Find any <base href> tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied.
If handle_failures is None (default), a failure to process a URL will abort the processing. If set to ‘ignore’, errors are ignored. If set to ‘discard’, failing URLs will be removed.
- rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)
Rewrite all the links in the document. For each link link_repl_func(link) will be called, and the return value will replace the old link.
Note that links may not be absolute (unless you first called make_links_absolute()), and may be internal (e.g., '#anchor'). They can also be values like 'mailto:email' or 'javascript:expr'.
If you give base_href then all links passed to link_repl_func() will take that into account.
If the link_repl_func returns None, the attribute or tag text will be removed completely.
- set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None, creates a ‘boolean’ attribute without value, e.g. “<form novalidate></form>” for
form.set('novalidate')
.
- text_content()
Return the text content of the tag (and the text in any children).
- values(self)
Gets element attribute values as a sequence of strings. The attributes are returned in an arbitrary order.
- xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
Evaluate an xpath expression using the element as context node.
- attrib
Element attribute dictionary. Where possible, use get(), set(), keys(), values() and items() to access element attributes.
- base
The base URI of the Element (xml:base or HTML base URL). None if the base URI is unknown.
Note that the value depends on the URL of the document that holds the Element if there is no xml:base attribute on the Element or its ancestors.
Setting this property will set an xml:base attribute on the Element, regardless of the document type (XML or HTML).
- property base_url
Returns the base URL, given when the page was parsed.
Use with urllib.parse.urljoin(el.base_url, href) to get absolute URLs.
- property body
Return the <body> element. Can be called from a child element to get the document’s body.
- property classes
A set-like wrapper around the ‘class’ attribute.
- property forms
Return a list of all the forms
- property head
Returns the <head> element. Can be called from a child element to get the document’s head.
- property label
Get or set any <label> element associated with this element.
- property name
Get/set the name of the element
- nsmap
Namespace prefix->URI mapping known in the context of this Element. This includes all namespace declarations of the parents.
Note that changing the returned dict has no effect on the Element.
- prefix
Namespace prefix or None.
- sourceline
Original line number as found by the parser or None if unknown.
- tag
Element tag
- tail
Text after this element’s end tag, but before the next sibling element’s start tag. This is either a string or the value None, if there was no text.
- text
Text before the first subelement. This is either a string or the value None, if there was no text.
- property value
Get/set the value (which is the contents of this element)
- class lxml.html.XHTMLParser(**kwargs)[source]
Bases:
XMLParser
An XML parser that is configured to return lxml.html Element objects.
Note that this parser is not really XHTML aware unless you let it load a DTD that declares the HTML entities. To do this, make sure you have the XHTML DTDs installed in your catalogs, and create the parser like this:
>>> parser = XHTMLParser(load_dtd=True)
If you additionally want to validate the document, use this:
>>> parser = XHTMLParser(dtd_validation=True)
For catalog support, see http://www.xmlsoft.org/catalog.html.
- close(self)
Terminates feeding data to this parser. This tells the parser to process any remaining data in the feed buffer, and then returns the root Element of the tree that was parsed.
This method must be called after passing the last chunk of data into the
feed()
method. It should only be called when using the feed parser interface, all other usage is undefined.
- copy(self)
Create a new parser with the same configuration.
- feed(self, data)
Feeds data to the parser. The argument should be an 8-bit string buffer containing encoded data, although Unicode is supported as long as both string types are not mixed.
This is the main entry point to the consumer interface of a parser. The parser will parse as much of the XML stream as it can on each call. To finish parsing or to reset the parser, call the close() method. Both methods may raise ParseError if errors occur in the input data. If an error is raised, there is no longer a need to call close().
The feed parser interface is independent of the normal parser usage. You can use the same parser as a feed parser and in the parse() function concurrently.
- makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
Creates a new element associated with this parser.
- setElementClassLookup(lookup)
- Deprecated
use
parser.set_element_class_lookup(lookup)
instead.
- set_element_class_lookup(self, lookup=None)
Set a lookup scheme for element classes generated from this parser.
Reset it by passing None or nothing.
- error_log
The error log of the last parser run.
- feed_error_log
The error log of the last (or current) run of the feed parser.
Note that this is local to the feed parser and thus is different from what the
error_log
property returns.
- resolvers
The custom resolver registry of this parser.
- target
- version
The version of the underlying XML parser.
- class lxml.html._MethodFunc(name, copy=False, source_class=<class 'lxml.html.HtmlMixin'>)[source]
Bases:
object
An object that represents a method on an element as a function; the function takes either an element or an HTML string. It returns whatever the function normally returns, or if the function works in-place (and so returns None) it returns a serialized form of the resulting document.
- lxml.html.Element(*args, **kw)[source]
Create a new HTML Element.
This can also be used for XHTML documents.
- lxml.html.__bytes_replace_meta_content_type(repl, string, count=0)
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
- lxml.html.__str_replace_meta_content_type(repl, string, count=0)
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
- lxml.html._iter_css_imports(string, pos=0, endpos=9223372036854775807)
Return an iterator over all non-overlapping matches for the RE pattern in string.
For each match, the iterator returns a match object.
- lxml.html._iter_css_urls(string, pos=0, endpos=9223372036854775807)
Return an iterator over all non-overlapping matches for the RE pattern in string.
For each match, the iterator returns a match object.
- lxml.html._looks_like_full_html_bytes(string, pos=0, endpos=9223372036854775807)
Matches zero or more characters at the beginning of the string.
- lxml.html._looks_like_full_html_unicode(string, pos=0, endpos=9223372036854775807)
Matches zero or more characters at the beginning of the string.
- lxml.html._parse_meta_refresh_url(string, pos=0, endpos=9223372036854775807)
Scan through string looking for a match, and return a corresponding match object instance.
Return None if no position in the string matches.
- lxml.html.fragment_fromstring(html, create_parent=False, base_url=None, parser=None, **kw)[source]
Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element.
If create_parent is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is also allowed, as are multiple elements as result of the parsing.
Passing a base_url will set the document’s base_url attribute (and the tree’s docinfo.URL).
- lxml.html.fragments_fromstring(html, no_leading_text=False, base_url=None, parser=None, **kw)[source]
Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true, then it will be an error if there is leading text, and it will always be a list of only elements.
base_url will set the document’s base_url attribute (and the tree’s docinfo.URL).
- lxml.html.fromstring(html, base_url=None, parser=None, **kw)[source]
Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it is a fragment or a document.
base_url will set the document’s base_url attribute (and the tree’s docinfo.URL)
- lxml.html.html_to_xhtml(html)[source]
Convert all tags in an HTML tree to XHTML by moving them to the XHTML namespace.
- lxml.html.open_in_browser(doc, encoding=None)[source]
Open the HTML document in a web browser, saving it to a temporary file to open it. Note that this does not delete the file after use. This is mainly meant for debugging.
- lxml.html.parse(filename_or_url, parser=None, base_url=None, **kw)[source]
Parse a filename, URL, or file-like object into an HTML document tree. Note: this returns a tree, not an element. Use parse(...).getroot() to get the document root.
You can override the base URL with the base_url keyword. This is most useful when parsing from a file-like object.
- lxml.html.submit_form(form, extra_values=None, open_http=None)[source]
Helper function to submit a form. Returns a file-like object, as from urllib.request.urlopen(). This object also has a .geturl() function, which shows the URL if there were any redirects.
You can use this like:
form = doc.forms[0]
form.inputs['foo'].value = 'bar' # etc
response = form.submit()
doc = parse(response)
doc.make_links_absolute(response.geturl())
To change the HTTP requester, pass a function as open_http keyword argument that opens the URL for you. The function must have the following signature:
open_http(method, URL, values)
The method is one of ‘GET’ or ‘POST’, the URL is the target URL as a string, and the values are a sequence of (name, value) tuples with the form data.
- lxml.html.tostring(doc, pretty_print=False, include_meta_content_type=False, encoding=None, method='html', with_tail=True, doctype=None)[source]
Return an HTML string representation of the document.
Note: if include_meta_content_type is true this will create a <meta http-equiv="Content-Type" ...> tag in the head; regardless of the value of include_meta_content_type any existing <meta http-equiv="Content-Type" ...> tag will be removed.
The encoding argument controls the output encoding (defaults to ASCII, with &#…; character references for any characters outside of ASCII). Note that you can pass the name 'unicode' as encoding argument to serialise to a Unicode string.
The method argument defines the output method. It defaults to ‘html’, but can also be ‘xml’ for xhtml output, or ‘text’ to serialise to plain text without markup.
To leave out the tail text of the top-level element that is being serialised, pass with_tail=False.
The doctype option allows passing in a plain string that will be serialised before the XML tree. Note that passing in non well-formed content here will make the XML output non well-formed. Also, an existing doctype in the document tree will not be removed when serialising an ElementTree instance.
Example:
>>> from lxml import html
>>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')
>>> html.tostring(root)
b'<p>Hello<br>world!</p>'
>>> html.tostring(root, method='html')
b'<p>Hello<br>world!</p>'
>>> html.tostring(root, method='xml')
b'<p>Hello<br/>world!</p>'
>>> html.tostring(root, method='text')
b'Helloworld!'
>>> html.tostring(root, method='text', encoding='unicode')
'Helloworld!'
>>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
>>> html.tostring(root[0], method='text', encoding='unicode')
'Helloworld!TAIL'
>>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
'Helloworld!'
>>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
>>> html.tostring(doc, method='html', encoding='unicode')
'<html><body><p>Hello<br>world!</p></body></html>'
>>> print(html.tostring(doc, method='html', encoding='unicode',
...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html><body><p>Hello<br>world!</p></body></html>