|
@ -11,7 +11,7 @@ A well-formed XML/HTML document yields a well-formed data |
|
|
structure. An ill-formed XML/HTML document yields a correspondingly |
|
|
structure. An ill-formed XML/HTML document yields a correspondingly |
|
|
ill-formed data structure. If your document is only locally |
|
|
ill-formed data structure. If your document is only locally |
|
|
well-formed, you can use this library to find and process the |
|
|
well-formed, you can use this library to find and process the |
|
|
well-formed part of it. The BeautifulSoup class |
|
|
|
|
|
|
|
|
well-formed part of it. The BeautifulSoup class |
|
|
|
|
|
|
|
|
Beautiful Soup works with Python 2.2 and up. It has no external |
|
|
Beautiful Soup works with Python 2.2 and up. It has no external |
|
|
dependencies, but you'll have more success at converting data to UTF-8 |
|
|
dependencies, but you'll have more success at converting data to UTF-8 |
|
@ -24,7 +24,7 @@ if you also install these three packages: |
|
|
http://cjkpython.i18n.org/ |
|
|
http://cjkpython.i18n.org/ |
|
|
|
|
|
|
|
|
Beautiful Soup defines classes for two main parsing strategies: |
|
|
Beautiful Soup defines classes for two main parsing strategies: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
|
|
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
|
|
language that kind of looks like XML. |
|
|
language that kind of looks like XML. |
|
|
|
|
|
|
|
@ -71,7 +71,7 @@ class PageElement: |
|
|
|
|
|
|
|
|
def setup(self, parent=None, previous=None): |
|
|
def setup(self, parent=None, previous=None): |
|
|
"""Sets up the initial relations between this element and |
|
|
"""Sets up the initial relations between this element and |
|
|
other elements.""" |
|
|
|
|
|
|
|
|
other elements.""" |
|
|
self.parent = parent |
|
|
self.parent = parent |
|
|
self.previous = previous |
|
|
self.previous = previous |
|
|
self.next = None |
|
|
self.next = None |
|
@ -81,7 +81,7 @@ class PageElement: |
|
|
self.previousSibling = self.parent.contents[-1] |
|
|
self.previousSibling = self.parent.contents[-1] |
|
|
self.previousSibling.nextSibling = self |
|
|
self.previousSibling.nextSibling = self |
|
|
|
|
|
|
|
|
def replaceWith(self, replaceWith): |
|
|
|
|
|
|
|
|
def replaceWith(self, replaceWith): |
|
|
oldParent = self.parent |
|
|
oldParent = self.parent |
|
|
myIndex = self.parent.contents.index(self) |
|
|
myIndex = self.parent.contents.index(self) |
|
|
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: |
|
|
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: |
|
@ -92,11 +92,11 @@ class PageElement: |
|
|
# means that when we extract it, the index of this |
|
|
# means that when we extract it, the index of this |
|
|
# element will change. |
|
|
# element will change. |
|
|
myIndex = myIndex - 1 |
|
|
myIndex = myIndex - 1 |
|
|
self.extract() |
|
|
|
|
|
|
|
|
self.extract() |
|
|
oldParent.insert(myIndex, replaceWith) |
|
|
oldParent.insert(myIndex, replaceWith) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract(self): |
|
|
def extract(self): |
|
|
"""Destructively rips this element out of the tree.""" |
|
|
|
|
|
|
|
|
"""Destructively rips this element out of the tree.""" |
|
|
if self.parent: |
|
|
if self.parent: |
|
|
try: |
|
|
try: |
|
|
self.parent.contents.remove(self) |
|
|
self.parent.contents.remove(self) |
|
@ -105,7 +105,7 @@ class PageElement: |
|
|
|
|
|
|
|
|
#Find the two elements that would be next to each other if |
|
|
#Find the two elements that would be next to each other if |
|
|
#this element (and any children) hadn't been parsed. Connect |
|
|
#this element (and any children) hadn't been parsed. Connect |
|
|
#the two. |
|
|
|
|
|
|
|
|
#the two. |
|
|
lastChild = self._lastRecursiveChild() |
|
|
lastChild = self._lastRecursiveChild() |
|
|
nextElement = lastChild.next |
|
|
nextElement = lastChild.next |
|
|
|
|
|
|
|
@ -116,12 +116,12 @@ class PageElement: |
|
|
self.previous = None |
|
|
self.previous = None |
|
|
lastChild.next = None |
|
|
lastChild.next = None |
|
|
|
|
|
|
|
|
self.parent = None |
|
|
|
|
|
|
|
|
self.parent = None |
|
|
if self.previousSibling: |
|
|
if self.previousSibling: |
|
|
self.previousSibling.nextSibling = self.nextSibling |
|
|
self.previousSibling.nextSibling = self.nextSibling |
|
|
if self.nextSibling: |
|
|
if self.nextSibling: |
|
|
self.nextSibling.previousSibling = self.previousSibling |
|
|
self.nextSibling.previousSibling = self.previousSibling |
|
|
self.previousSibling = self.nextSibling = None |
|
|
|
|
|
|
|
|
self.previousSibling = self.nextSibling = None |
|
|
|
|
|
|
|
|
def _lastRecursiveChild(self): |
|
|
def _lastRecursiveChild(self): |
|
|
"Finds the last element beneath this object to be parsed." |
|
|
"Finds the last element beneath this object to be parsed." |
|
@ -134,12 +134,12 @@ class PageElement: |
|
|
if (isinstance(newChild, basestring) |
|
|
if (isinstance(newChild, basestring) |
|
|
or isinstance(newChild, unicode)) \ |
|
|
or isinstance(newChild, unicode)) \ |
|
|
and not isinstance(newChild, NavigableString): |
|
|
and not isinstance(newChild, NavigableString): |
|
|
newChild = NavigableString(newChild) |
|
|
|
|
|
|
|
|
newChild = NavigableString(newChild) |
|
|
|
|
|
|
|
|
position = min(position, len(self.contents)) |
|
|
position = min(position, len(self.contents)) |
|
|
if hasattr(newChild, 'parent') and newChild.parent != None: |
|
|
if hasattr(newChild, 'parent') and newChild.parent != None: |
|
|
# We're 'inserting' an element that's already one |
|
|
# We're 'inserting' an element that's already one |
|
|
# of this object's children. |
|
|
|
|
|
|
|
|
# of this object's children. |
|
|
if newChild.parent == self: |
|
|
if newChild.parent == self: |
|
|
index = self.find(newChild) |
|
|
index = self.find(newChild) |
|
|
if index and index < position: |
|
|
if index and index < position: |
|
@ -149,7 +149,7 @@ class PageElement: |
|
|
# will jump down one. |
|
|
# will jump down one. |
|
|
position = position - 1 |
|
|
position = position - 1 |
|
|
newChild.extract() |
|
|
newChild.extract() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
newChild.parent = self |
|
|
newChild.parent = self |
|
|
previousChild = None |
|
|
previousChild = None |
|
|
if position == 0: |
|
|
if position == 0: |
|
@ -161,13 +161,13 @@ class PageElement: |
|
|
newChild.previousSibling.nextSibling = newChild |
|
|
newChild.previousSibling.nextSibling = newChild |
|
|
newChild.previous = previousChild._lastRecursiveChild() |
|
|
newChild.previous = previousChild._lastRecursiveChild() |
|
|
if newChild.previous: |
|
|
if newChild.previous: |
|
|
newChild.previous.next = newChild |
|
|
|
|
|
|
|
|
newChild.previous.next = newChild |
|
|
|
|
|
|
|
|
newChildsLastElement = newChild._lastRecursiveChild() |
|
|
newChildsLastElement = newChild._lastRecursiveChild() |
|
|
|
|
|
|
|
|
if position >= len(self.contents): |
|
|
if position >= len(self.contents): |
|
|
newChild.nextSibling = None |
|
|
newChild.nextSibling = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
parent = self |
|
|
parent = self |
|
|
parentsNextSibling = None |
|
|
parentsNextSibling = None |
|
|
while not parentsNextSibling: |
|
|
while not parentsNextSibling: |
|
@ -180,8 +180,8 @@ class PageElement: |
|
|
else: |
|
|
else: |
|
|
newChildsLastElement.next = None |
|
|
newChildsLastElement.next = None |
|
|
else: |
|
|
else: |
|
|
nextChild = self.contents[position] |
|
|
|
|
|
newChild.nextSibling = nextChild |
|
|
|
|
|
|
|
|
nextChild = self.contents[position] |
|
|
|
|
|
newChild.nextSibling = nextChild |
|
|
if newChild.nextSibling: |
|
|
if newChild.nextSibling: |
|
|
newChild.nextSibling.previousSibling = newChild |
|
|
newChild.nextSibling.previousSibling = newChild |
|
|
newChildsLastElement.next = nextChild |
|
|
newChildsLastElement.next = nextChild |
|
@ -269,7 +269,7 @@ class PageElement: |
|
|
if l: |
|
|
if l: |
|
|
r = l[0] |
|
|
r = l[0] |
|
|
return r |
|
|
return r |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _findAll(self, name, attrs, text, limit, generator, **kwargs): |
|
|
def _findAll(self, name, attrs, text, limit, generator, **kwargs): |
|
|
"Iterates over a generator looking for things that match." |
|
|
"Iterates over a generator looking for things that match." |
|
|
|
|
|
|
|
@ -294,7 +294,7 @@ class PageElement: |
|
|
return results |
|
|
return results |
|
|
|
|
|
|
|
|
#These Generators can be used to navigate starting from both |
|
|
#These Generators can be used to navigate starting from both |
|
|
#NavigableStrings and Tags. |
|
|
|
|
|
|
|
|
#NavigableStrings and Tags. |
|
|
def nextGenerator(self): |
|
|
def nextGenerator(self): |
|
|
i = self |
|
|
i = self |
|
|
while i: |
|
|
while i: |
|
@ -328,7 +328,7 @@ class PageElement: |
|
|
# Utility methods |
|
|
# Utility methods |
|
|
def substituteEncoding(self, str, encoding=None): |
|
|
def substituteEncoding(self, str, encoding=None): |
|
|
encoding = encoding or "utf-8" |
|
|
encoding = encoding or "utf-8" |
|
|
return str.replace("%SOUP-ENCODING%", encoding) |
|
|
|
|
|
|
|
|
return str.replace("%SOUP-ENCODING%", encoding) |
|
|
|
|
|
|
|
|
def toEncoding(self, s, encoding=None): |
|
|
def toEncoding(self, s, encoding=None): |
|
|
"""Encodes an object to a string in some encoding, or to Unicode. |
|
|
"""Encodes an object to a string in some encoding, or to Unicode. |
|
@ -367,7 +367,7 @@ class NavigableString(unicode, PageElement): |
|
|
return self.encode(encoding) |
|
|
return self.encode(encoding) |
|
|
else: |
|
|
else: |
|
|
return self |
|
|
return self |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CData(NavigableString): |
|
|
class CData(NavigableString): |
|
|
|
|
|
|
|
|
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): |
|
|
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): |
|
@ -382,11 +382,11 @@ class ProcessingInstruction(NavigableString): |
|
|
|
|
|
|
|
|
class Comment(NavigableString): |
|
|
class Comment(NavigableString): |
|
|
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): |
|
|
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): |
|
|
return "<!--%s-->" % NavigableString.__str__(self, encoding) |
|
|
|
|
|
|
|
|
return "<!--%s-->" % NavigableString.__str__(self, encoding) |
|
|
|
|
|
|
|
|
class Declaration(NavigableString): |
|
|
class Declaration(NavigableString): |
|
|
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): |
|
|
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): |
|
|
return "<!%s>" % NavigableString.__str__(self, encoding) |
|
|
|
|
|
|
|
|
return "<!%s>" % NavigableString.__str__(self, encoding) |
|
|
|
|
|
|
|
|
class Tag(PageElement): |
|
|
class Tag(PageElement): |
|
|
|
|
|
|
|
@ -419,7 +419,7 @@ class Tag(PageElement): |
|
|
"""Returns the value of the 'key' attribute for the tag, or |
|
|
"""Returns the value of the 'key' attribute for the tag, or |
|
|
the value given for 'default' if it doesn't have that |
|
|
the value given for 'default' if it doesn't have that |
|
|
attribute.""" |
|
|
attribute.""" |
|
|
return self._getAttrMap().get(key, default) |
|
|
|
|
|
|
|
|
return self._getAttrMap().get(key, default) |
|
|
|
|
|
|
|
|
def has_key(self, key): |
|
|
def has_key(self, key): |
|
|
return self._getAttrMap().has_key(key) |
|
|
return self._getAttrMap().has_key(key) |
|
@ -444,7 +444,7 @@ class Tag(PageElement): |
|
|
"A tag is non-None even if it has no contents." |
|
|
"A tag is non-None even if it has no contents." |
|
|
return True |
|
|
return True |
|
|
|
|
|
|
|
|
def __setitem__(self, key, value): |
|
|
|
|
|
|
|
|
def __setitem__(self, key, value): |
|
|
"""Setting tag[key] sets the value of the 'key' attribute for the |
|
|
"""Setting tag[key] sets the value of the 'key' attribute for the |
|
|
tag.""" |
|
|
tag.""" |
|
|
self._getAttrMap() |
|
|
self._getAttrMap() |
|
@ -522,7 +522,7 @@ class Tag(PageElement): |
|
|
if self.attrs: |
|
|
if self.attrs: |
|
|
for key, val in self.attrs: |
|
|
for key, val in self.attrs: |
|
|
fmt = '%s="%s"' |
|
|
fmt = '%s="%s"' |
|
|
if isString(val): |
|
|
|
|
|
|
|
|
if isString(val): |
|
|
if self.containsSubstitutions and '%SOUP-ENCODING%' in val: |
|
|
if self.containsSubstitutions and '%SOUP-ENCODING%' in val: |
|
|
val = self.substituteEncoding(val, encoding) |
|
|
val = self.substituteEncoding(val, encoding) |
|
|
|
|
|
|
|
@ -555,7 +555,7 @@ class Tag(PageElement): |
|
|
val = re.sub("([<>]|&(?![^\s]+;))", |
|
|
val = re.sub("([<>]|&(?![^\s]+;))", |
|
|
lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", |
|
|
lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", |
|
|
val) |
|
|
val) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
attrs.append(fmt % (self.toEncoding(key, encoding), |
|
|
attrs.append(fmt % (self.toEncoding(key, encoding), |
|
|
self.toEncoding(val, encoding))) |
|
|
self.toEncoding(val, encoding))) |
|
|
close = '' |
|
|
close = '' |
|
@ -577,7 +577,7 @@ class Tag(PageElement): |
|
|
s = [] |
|
|
s = [] |
|
|
attributeString = '' |
|
|
attributeString = '' |
|
|
if attrs: |
|
|
if attrs: |
|
|
attributeString = ' ' + ' '.join(attrs) |
|
|
|
|
|
|
|
|
attributeString = ' ' + ' '.join(attrs) |
|
|
if prettyPrint: |
|
|
if prettyPrint: |
|
|
s.append(space) |
|
|
s.append(space) |
|
|
s.append('<%s%s%s>' % (encodedName, attributeString, close)) |
|
|
s.append('<%s%s%s>' % (encodedName, attributeString, close)) |
|
@ -609,14 +609,14 @@ class Tag(PageElement): |
|
|
elif isinstance(c, Tag): |
|
|
elif isinstance(c, Tag): |
|
|
s.append(c.__str__(encoding, prettyPrint, indentLevel)) |
|
|
s.append(c.__str__(encoding, prettyPrint, indentLevel)) |
|
|
if text and prettyPrint: |
|
|
if text and prettyPrint: |
|
|
text = text.strip() |
|
|
|
|
|
|
|
|
text = text.strip() |
|
|
if text: |
|
|
if text: |
|
|
if prettyPrint: |
|
|
if prettyPrint: |
|
|
s.append(" " * (indentLevel-1)) |
|
|
s.append(" " * (indentLevel-1)) |
|
|
s.append(text) |
|
|
s.append(text) |
|
|
if prettyPrint: |
|
|
if prettyPrint: |
|
|
s.append("\n") |
|
|
s.append("\n") |
|
|
return ''.join(s) |
|
|
|
|
|
|
|
|
return ''.join(s) |
|
|
|
|
|
|
|
|
#Soup methods |
|
|
#Soup methods |
|
|
|
|
|
|
|
@ -651,13 +651,13 @@ class Tag(PageElement): |
|
|
# Pre-3.x compatibility methods |
|
|
# Pre-3.x compatibility methods |
|
|
first = find |
|
|
first = find |
|
|
fetch = findAll |
|
|
fetch = findAll |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetchText(self, text=None, recursive=True, limit=None): |
|
|
def fetchText(self, text=None, recursive=True, limit=None): |
|
|
return self.findAll(text=text, recursive=recursive, limit=limit) |
|
|
return self.findAll(text=text, recursive=recursive, limit=limit) |
|
|
|
|
|
|
|
|
def firstText(self, text=None, recursive=True): |
|
|
def firstText(self, text=None, recursive=True): |
|
|
return self.find(text=text, recursive=recursive) |
|
|
return self.find(text=text, recursive=recursive) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#Utility methods |
|
|
#Utility methods |
|
|
|
|
|
|
|
|
def append(self, tag): |
|
|
def append(self, tag): |
|
@ -672,7 +672,7 @@ class Tag(PageElement): |
|
|
if not getattr(self, 'attrMap'): |
|
|
if not getattr(self, 'attrMap'): |
|
|
self.attrMap = {} |
|
|
self.attrMap = {} |
|
|
for (key, value) in self.attrs: |
|
|
for (key, value) in self.attrs: |
|
|
self.attrMap[key] = value |
|
|
|
|
|
|
|
|
self.attrMap[key] = value |
|
|
return self.attrMap |
|
|
return self.attrMap |
|
|
|
|
|
|
|
|
#Generator methods |
|
|
#Generator methods |
|
@ -680,12 +680,12 @@ class Tag(PageElement): |
|
|
for i in range(0, len(self.contents)): |
|
|
for i in range(0, len(self.contents)): |
|
|
yield self.contents[i] |
|
|
yield self.contents[i] |
|
|
raise StopIteration |
|
|
raise StopIteration |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recursiveChildGenerator(self): |
|
|
def recursiveChildGenerator(self): |
|
|
stack = [(self, 0)] |
|
|
stack = [(self, 0)] |
|
|
while stack: |
|
|
while stack: |
|
|
tag, start = stack.pop() |
|
|
tag, start = stack.pop() |
|
|
if isinstance(tag, Tag): |
|
|
|
|
|
|
|
|
if isinstance(tag, Tag): |
|
|
for i in range(start, len(tag.contents)): |
|
|
for i in range(start, len(tag.contents)): |
|
|
a = tag.contents[i] |
|
|
a = tag.contents[i] |
|
|
yield a |
|
|
yield a |
|
@ -720,7 +720,7 @@ class SoupStrainer: |
|
|
return self.text |
|
|
return self.text |
|
|
else: |
|
|
else: |
|
|
return "%s|%s" % (self.name, self.attrs) |
|
|
return "%s|%s" % (self.name, self.attrs) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def searchTag(self, markupName=None, markupAttrs={}): |
|
|
def searchTag(self, markupName=None, markupAttrs={}): |
|
|
found = None |
|
|
found = None |
|
|
markup = None |
|
|
markup = None |
|
@ -737,7 +737,7 @@ class SoupStrainer: |
|
|
if callFunctionWithTagData: |
|
|
if callFunctionWithTagData: |
|
|
match = self.name(markupName, markupAttrs) |
|
|
match = self.name(markupName, markupAttrs) |
|
|
else: |
|
|
else: |
|
|
match = True |
|
|
|
|
|
|
|
|
match = True |
|
|
markupAttrMap = None |
|
|
markupAttrMap = None |
|
|
for attr, matchAgainst in self.attrs.items(): |
|
|
for attr, matchAgainst in self.attrs.items(): |
|
|
if not markupAttrMap: |
|
|
if not markupAttrMap: |
|
@ -762,7 +762,7 @@ class SoupStrainer: |
|
|
#print 'looking for %s in %s' % (self, markup) |
|
|
#print 'looking for %s in %s' % (self, markup) |
|
|
found = None |
|
|
found = None |
|
|
# If given a list of items, scan it for a text element that |
|
|
# If given a list of items, scan it for a text element that |
|
|
# matches. |
|
|
|
|
|
|
|
|
# matches. |
|
|
if isList(markup) and not isinstance(markup, Tag): |
|
|
if isList(markup) and not isinstance(markup, Tag): |
|
|
for element in markup: |
|
|
for element in markup: |
|
|
if isinstance(element, NavigableString) \ |
|
|
if isinstance(element, NavigableString) \ |
|
@ -783,8 +783,8 @@ class SoupStrainer: |
|
|
raise Exception, "I don't know how to match against a %s" \ |
|
|
raise Exception, "I don't know how to match against a %s" \ |
|
|
% markup.__class__ |
|
|
% markup.__class__ |
|
|
return found |
|
|
return found |
|
|
|
|
|
|
|
|
def _matches(self, markup, matchAgainst): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _matches(self, markup, matchAgainst): |
|
|
#print "Matching %s against %s" % (markup, matchAgainst) |
|
|
#print "Matching %s against %s" % (markup, matchAgainst) |
|
|
result = False |
|
|
result = False |
|
|
if matchAgainst == True and type(matchAgainst) == types.BooleanType: |
|
|
if matchAgainst == True and type(matchAgainst) == types.BooleanType: |
|
@ -835,7 +835,7 @@ def isString(s): |
|
|
"""Convenience method that works with all 2.x versions of Python |
|
|
"""Convenience method that works with all 2.x versions of Python |
|
|
to determine whether or not something is stringlike.""" |
|
|
to determine whether or not something is stringlike.""" |
|
|
try: |
|
|
try: |
|
|
return isinstance(s, unicode) or isintance(s, basestring) |
|
|
|
|
|
|
|
|
return isinstance(s, unicode) or isintance(s, basestring) |
|
|
except NameError: |
|
|
except NameError: |
|
|
return isinstance(s, str) |
|
|
return isinstance(s, str) |
|
|
|
|
|
|
|
@ -865,7 +865,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
"""This class contains the basic parser and search code. It defines |
|
|
"""This class contains the basic parser and search code. It defines |
|
|
a parser that knows nothing about tag behavior except for the |
|
|
a parser that knows nothing about tag behavior except for the |
|
|
following: |
|
|
following: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
You can't close a tag without closing all the tags it encloses. |
|
|
You can't close a tag without closing all the tags it encloses. |
|
|
That is, "<foo><bar></foo>" actually means |
|
|
That is, "<foo><bar></foo>" actually means |
|
|
"<foo><bar></bar></foo>". |
|
|
"<foo><bar></bar></foo>". |
|
@ -880,7 +880,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
|
|
|
|
|
|
XML_ENTITY_LIST = {} |
|
|
XML_ENTITY_LIST = {} |
|
|
for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values(): |
|
|
for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values(): |
|
|
XML_ENTITY_LIST[i] = True |
|
|
|
|
|
|
|
|
XML_ENTITY_LIST[i] = True |
|
|
|
|
|
|
|
|
SELF_CLOSING_TAGS = {} |
|
|
SELF_CLOSING_TAGS = {} |
|
|
NESTABLE_TAGS = {} |
|
|
NESTABLE_TAGS = {} |
|
@ -903,7 +903,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
convertEntities=None, selfClosingTags=None): |
|
|
convertEntities=None, selfClosingTags=None): |
|
|
"""The Soup object is initialized as the 'root tag', and the |
|
|
"""The Soup object is initialized as the 'root tag', and the |
|
|
provided markup (which can be a string or a file-like object) |
|
|
provided markup (which can be a string or a file-like object) |
|
|
is fed into the underlying parser. |
|
|
|
|
|
|
|
|
is fed into the underlying parser. |
|
|
|
|
|
|
|
|
sgmllib will process most bad HTML, and the BeautifulSoup |
|
|
sgmllib will process most bad HTML, and the BeautifulSoup |
|
|
class has some tricks for dealing with some HTML that kills |
|
|
class has some tricks for dealing with some HTML that kills |
|
@ -937,7 +937,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
self.smartQuotesTo = None |
|
|
self.smartQuotesTo = None |
|
|
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) |
|
|
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) |
|
|
SGMLParser.__init__(self) |
|
|
SGMLParser.__init__(self) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if hasattr(markup, 'read'): # It's a file-type object. |
|
|
if hasattr(markup, 'read'): # It's a file-type object. |
|
|
markup = markup.read() |
|
|
markup = markup.read() |
|
|
self.markup = markup |
|
|
self.markup = markup |
|
@ -947,7 +947,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
except StopParsing: |
|
|
except StopParsing: |
|
|
pass |
|
|
pass |
|
|
self.markup = None # The markup can now be GCed |
|
|
self.markup = None # The markup can now be GCed |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _feed(self, inDocumentEncoding=None): |
|
|
def _feed(self, inDocumentEncoding=None): |
|
|
# Convert the document to Unicode. |
|
|
# Convert the document to Unicode. |
|
|
markup = self.markup |
|
|
markup = self.markup |
|
@ -963,7 +963,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
if markup: |
|
|
if markup: |
|
|
if self.markupMassage: |
|
|
if self.markupMassage: |
|
|
if not isList(self.markupMassage): |
|
|
if not isList(self.markupMassage): |
|
|
self.markupMassage = self.MARKUP_MASSAGE |
|
|
|
|
|
|
|
|
self.markupMassage = self.MARKUP_MASSAGE |
|
|
for fix, m in self.markupMassage: |
|
|
for fix, m in self.markupMassage: |
|
|
markup = fix.sub(m, markup) |
|
|
markup = fix.sub(m, markup) |
|
|
self.reset() |
|
|
self.reset() |
|
@ -992,7 +992,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
self-closing tag according to this parser.""" |
|
|
self-closing tag according to this parser.""" |
|
|
return self.SELF_CLOSING_TAGS.has_key(name) \ |
|
|
return self.SELF_CLOSING_TAGS.has_key(name) \ |
|
|
or self.instanceSelfClosingTags.has_key(name) |
|
|
or self.instanceSelfClosingTags.has_key(name) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def reset(self): |
|
|
def reset(self): |
|
|
Tag.__init__(self, self, self.ROOT_TAG_NAME) |
|
|
Tag.__init__(self, self, self.ROOT_TAG_NAME) |
|
|
self.hidden = 1 |
|
|
self.hidden = 1 |
|
@ -1002,7 +1002,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
self.tagStack = [] |
|
|
self.tagStack = [] |
|
|
self.quoteStack = [] |
|
|
self.quoteStack = [] |
|
|
self.pushTag(self) |
|
|
self.pushTag(self) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def popTag(self): |
|
|
def popTag(self): |
|
|
tag = self.tagStack.pop() |
|
|
tag = self.tagStack.pop() |
|
|
# Tags with just one string-owning child get the child as a |
|
|
# Tags with just one string-owning child get the child as a |
|
@ -1052,7 +1052,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
the given tag.""" |
|
|
the given tag.""" |
|
|
#print "Popping to %s" % name |
|
|
#print "Popping to %s" % name |
|
|
if name == self.ROOT_TAG_NAME: |
|
|
if name == self.ROOT_TAG_NAME: |
|
|
return |
|
|
|
|
|
|
|
|
return |
|
|
|
|
|
|
|
|
numPops = 0 |
|
|
numPops = 0 |
|
|
mostRecentTag = None |
|
|
mostRecentTag = None |
|
@ -1065,7 +1065,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
|
|
|
|
|
|
for i in range(0, numPops): |
|
|
for i in range(0, numPops): |
|
|
mostRecentTag = self.popTag() |
|
|
mostRecentTag = self.popTag() |
|
|
return mostRecentTag |
|
|
|
|
|
|
|
|
return mostRecentTag |
|
|
|
|
|
|
|
|
def _smartPop(self, name): |
|
|
def _smartPop(self, name): |
|
|
|
|
|
|
|
@ -1102,7 +1102,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
and p.name in nestingResetTriggers) \ |
|
|
and p.name in nestingResetTriggers) \ |
|
|
or (nestingResetTriggers == None and isResetNesting |
|
|
or (nestingResetTriggers == None and isResetNesting |
|
|
and self.RESET_NESTING_TAGS.has_key(p.name)): |
|
|
and self.RESET_NESTING_TAGS.has_key(p.name)): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#If we encounter one of the nesting reset triggers |
|
|
#If we encounter one of the nesting reset triggers |
|
|
#peculiar to this tag, or we encounter another tag |
|
|
#peculiar to this tag, or we encounter another tag |
|
|
#that causes nesting to reset, pop up to but not |
|
|
#that causes nesting to reset, pop up to but not |
|
@ -1121,7 +1121,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
#print "<%s> is not real!" % name |
|
|
#print "<%s> is not real!" % name |
|
|
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) |
|
|
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) |
|
|
self.handle_data('<%s%s>' % (name, attrs)) |
|
|
self.handle_data('<%s%s>' % (name, attrs)) |
|
|
return |
|
|
|
|
|
|
|
|
return |
|
|
self.endData() |
|
|
self.endData() |
|
|
|
|
|
|
|
|
if not self.isSelfClosingTag(name) and not selfClosing: |
|
|
if not self.isSelfClosingTag(name) and not selfClosing: |
|
@ -1137,7 +1137,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
self.previous = tag |
|
|
self.previous = tag |
|
|
self.pushTag(tag) |
|
|
self.pushTag(tag) |
|
|
if selfClosing or self.isSelfClosingTag(name): |
|
|
if selfClosing or self.isSelfClosingTag(name): |
|
|
self.popTag() |
|
|
|
|
|
|
|
|
self.popTag() |
|
|
if name in self.QUOTE_TAGS: |
|
|
if name in self.QUOTE_TAGS: |
|
|
#print "Beginning quote (%s)" % name |
|
|
#print "Beginning quote (%s)" % name |
|
|
self.quoteStack.append(name) |
|
|
self.quoteStack.append(name) |
|
@ -1203,7 +1203,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): |
|
|
if not data: |
|
|
if not data: |
|
|
data = '&%s;' % ref |
|
|
data = '&%s;' % ref |
|
|
self.handle_data(data) |
|
|
self.handle_data(data) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def handle_decl(self, data): |
|
|
def handle_decl(self, data): |
|
|
"Handle DOCTYPEs and the like as Declaration objects." |
|
|
"Handle DOCTYPEs and the like as Declaration objects." |
|
|
self._toStringSubclass(data, Declaration) |
|
|
self._toStringSubclass(data, Declaration) |
|
@ -1286,7 +1286,7 @@ class BeautifulSoup(BeautifulStoneSoup): |
|
|
'spacer', 'link', 'frame', 'base']) |
|
|
'spacer', 'link', 'frame', 'base']) |
|
|
|
|
|
|
|
|
QUOTE_TAGS = {'script': None} |
|
|
QUOTE_TAGS = {'script': None} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#According to the HTML standard, each of these inline tags can |
|
|
#According to the HTML standard, each of these inline tags can |
|
|
#contain another tag of the same type. Furthermore, it's common |
|
|
#contain another tag of the same type. Furthermore, it's common |
|
|
#to actually use these tags this way. |
|
|
#to actually use these tags this way. |
|
@ -1298,7 +1298,7 @@ class BeautifulSoup(BeautifulStoneSoup): |
|
|
#to actually use these tags this way. |
|
|
#to actually use these tags this way. |
|
|
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] |
|
|
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] |
|
|
|
|
|
|
|
|
#Lists can contain other lists, but there are restrictions. |
|
|
|
|
|
|
|
|
#Lists can contain other lists, but there are restrictions. |
|
|
NESTABLE_LIST_TAGS = { 'ol' : [], |
|
|
NESTABLE_LIST_TAGS = { 'ol' : [], |
|
|
'ul' : [], |
|
|
'ul' : [], |
|
|
'li' : ['ul', 'ol'], |
|
|
'li' : ['ul', 'ol'], |
|
@ -1306,8 +1306,8 @@ class BeautifulSoup(BeautifulStoneSoup): |
|
|
'dd' : ['dl'], |
|
|
'dd' : ['dl'], |
|
|
'dt' : ['dl'] } |
|
|
'dt' : ['dl'] } |
|
|
|
|
|
|
|
|
#Tables can contain other tables, but there are restrictions. |
|
|
|
|
|
NESTABLE_TABLE_TAGS = {'table' : [], |
|
|
|
|
|
|
|
|
#Tables can contain other tables, but there are restrictions. |
|
|
|
|
|
NESTABLE_TABLE_TAGS = {'table' : [], |
|
|
'tr' : ['table', 'tbody', 'tfoot', 'thead'], |
|
|
'tr' : ['table', 'tbody', 'tfoot', 'thead'], |
|
|
'td' : ['tr'], |
|
|
'td' : ['tr'], |
|
|
'th' : ['tr'], |
|
|
'th' : ['tr'], |
|
@ -1377,7 +1377,7 @@ class BeautifulSoup(BeautifulStoneSoup): |
|
|
|
|
|
|
|
|
class StopParsing(Exception): |
|
|
class StopParsing(Exception): |
|
|
pass |
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ICantBelieveItsBeautifulSoup(BeautifulSoup): |
|
|
class ICantBelieveItsBeautifulSoup(BeautifulSoup): |
|
|
|
|
|
|
|
|
"""The BeautifulSoup class is oriented towards skipping over |
|
|
"""The BeautifulSoup class is oriented towards skipping over |
|
@ -1423,7 +1423,7 @@ class MinimalSoup(BeautifulSoup): |
|
|
|
|
|
|
|
|
This also makes it better for subclassing than BeautifulStoneSoup |
|
|
This also makes it better for subclassing than BeautifulStoneSoup |
|
|
or BeautifulSoup.""" |
|
|
or BeautifulSoup.""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
RESET_NESTING_TAGS = buildTagMap('noscript') |
|
|
RESET_NESTING_TAGS = buildTagMap('noscript') |
|
|
NESTABLE_TAGS = {} |
|
|
NESTABLE_TAGS = {} |
|
|
|
|
|
|
|
@ -1453,7 +1453,7 @@ class BeautifulSOAP(BeautifulStoneSoup): |
|
|
parent = self.tagStack[-2] |
|
|
parent = self.tagStack[-2] |
|
|
parent._getAttrMap() |
|
|
parent._getAttrMap() |
|
|
if (isinstance(tag, Tag) and len(tag.contents) == 1 and |
|
|
if (isinstance(tag, Tag) and len(tag.contents) == 1 and |
|
|
isinstance(tag.contents[0], NavigableString) and |
|
|
|
|
|
|
|
|
isinstance(tag.contents[0], NavigableString) and |
|
|
not parent.attrMap.has_key(tag.name)): |
|
|
not parent.attrMap.has_key(tag.name)): |
|
|
parent[tag.name] = tag.contents[0] |
|
|
parent[tag.name] = tag.contents[0] |
|
|
BeautifulStoneSoup.popTag(self) |
|
|
BeautifulStoneSoup.popTag(self) |
|
@ -1530,9 +1530,9 @@ class UnicodeDammit: |
|
|
self.triedEncodings = [] |
|
|
self.triedEncodings = [] |
|
|
if markup == '' or isinstance(markup, unicode): |
|
|
if markup == '' or isinstance(markup, unicode): |
|
|
self.originalEncoding = None |
|
|
self.originalEncoding = None |
|
|
self.unicode = unicode(markup) |
|
|
|
|
|
|
|
|
self.unicode = unicode(markup) |
|
|
return |
|
|
return |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
u = None |
|
|
u = None |
|
|
for proposedEncoding in overrideEncodings: |
|
|
for proposedEncoding in overrideEncodings: |
|
|
u = self._convertFrom(proposedEncoding) |
|
|
u = self._convertFrom(proposedEncoding) |
|
@ -1541,7 +1541,7 @@ class UnicodeDammit: |
|
|
for proposedEncoding in (documentEncoding, sniffedEncoding): |
|
|
for proposedEncoding in (documentEncoding, sniffedEncoding): |
|
|
u = self._convertFrom(proposedEncoding) |
|
|
u = self._convertFrom(proposedEncoding) |
|
|
if u: break |
|
|
if u: break |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# If no luck and we have auto-detection library, try that: |
|
|
# If no luck and we have auto-detection library, try that: |
|
|
if not u and chardet and not isinstance(self.markup, unicode): |
|
|
if not u and chardet and not isinstance(self.markup, unicode): |
|
|
u = self._convertFrom(chardet.detect(self.markup)['encoding']) |
|
|
u = self._convertFrom(chardet.detect(self.markup)['encoding']) |
|
@ -1563,9 +1563,9 @@ class UnicodeDammit: |
|
|
sub = '&#x%s;' % sub[1] |
|
|
sub = '&#x%s;' % sub[1] |
|
|
else: |
|
|
else: |
|
|
sub = '&%s;' % sub[0] |
|
|
sub = '&%s;' % sub[0] |
|
|
return sub |
|
|
|
|
|
|
|
|
return sub |
|
|
|
|
|
|
|
|
def _convertFrom(self, proposed): |
|
|
|
|
|
|
|
|
def _convertFrom(self, proposed): |
|
|
proposed = self.find_codec(proposed) |
|
|
proposed = self.find_codec(proposed) |
|
|
if not proposed or proposed in self.triedEncodings: |
|
|
if not proposed or proposed in self.triedEncodings: |
|
|
return None |
|
|
return None |
|
@ -1584,12 +1584,12 @@ class UnicodeDammit: |
|
|
try: |
|
|
try: |
|
|
# print "Trying to convert document to %s" % proposed |
|
|
# print "Trying to convert document to %s" % proposed |
|
|
u = self._toUnicode(markup, proposed) |
|
|
u = self._toUnicode(markup, proposed) |
|
|
self.markup = u |
|
|
|
|
|
|
|
|
self.markup = u |
|
|
self.originalEncoding = proposed |
|
|
self.originalEncoding = proposed |
|
|
except Exception, e: |
|
|
except Exception, e: |
|
|
# print "That didn't work!" |
|
|
# print "That didn't work!" |
|
|
# print e |
|
|
# print e |
|
|
return None |
|
|
|
|
|
|
|
|
return None |
|
|
#print "Correct encoding: %s" % proposed |
|
|
#print "Correct encoding: %s" % proposed |
|
|
return self.markup |
|
|
return self.markup |
|
|
|
|
|
|
|
@ -1617,7 +1617,7 @@ class UnicodeDammit: |
|
|
data = data[4:] |
|
|
data = data[4:] |
|
|
newdata = unicode(data, encoding) |
|
|
newdata = unicode(data, encoding) |
|
|
return newdata |
|
|
return newdata |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _detectEncoding(self, xml_data): |
|
|
def _detectEncoding(self, xml_data): |
|
|
"""Given a document, tries to detect its XML encoding.""" |
|
|
"""Given a document, tries to detect its XML encoding.""" |
|
|
xml_encoding = sniffed_xml_encoding = None |
|
|
xml_encoding = sniffed_xml_encoding = None |
|
@ -1689,7 +1689,7 @@ class UnicodeDammit: |
|
|
or charset |
|
|
or charset |
|
|
|
|
|
|
|
|
def _codec(self, charset): |
|
|
def _codec(self, charset): |
|
|
if not charset: return charset |
|
|
|
|
|
|
|
|
if not charset: return charset |
|
|
codec = None |
|
|
codec = None |
|
|
try: |
|
|
try: |
|
|
codecs.lookup(charset) |
|
|
codecs.lookup(charset) |
|
|