CMS 3D CMS Logo

Public Member Functions | Public Attributes | Private Member Functions

BeautifulSoup::HTMLParserBuilder Class Reference

Inherits HTMLParser::HTMLParser.

List of all members.

Public Member Functions

def __init__
def __init__
def handle_charref
def handle_charref
def handle_comment
def handle_comment
def handle_data
def handle_data
def handle_decl
def handle_decl
def handle_endtag
def handle_endtag
def handle_entityref
def handle_entityref
def handle_pi
def handle_pi
def handle_starttag
def handle_starttag
def parse_declaration
def parse_declaration

Public Attributes

 soup

Private Member Functions

def _toStringSubclass
def _toStringSubclass

Detailed Description

Definition at line 1005 of file BeautifulSoup.py.


Constructor & Destructor Documentation

def BeautifulSoup::HTMLParserBuilder::__init__ (   self,
  soup 
)

Definition at line 1007 of file BeautifulSoup.py.

01008                             :
01009         HTMLParser.__init__(self)
01010         self.soup = soup

def BeautifulSoup::HTMLParserBuilder::__init__ (   self,
  soup 
)

Definition at line 1007 of file BeautifulSoup.py.

01008                             :
01009         HTMLParser.__init__(self)
01010         self.soup = soup


Member Function Documentation

def BeautifulSoup::HTMLParserBuilder::_toStringSubclass (   self,
  text,
  subclass 
) [private]
Adds a certain piece of text to the tree as a NavigableString
subclass.

Definition at line 1025 of file BeautifulSoup.py.

01026                                                :
01027         """Adds a certain piece of text to the tree as a NavigableString
01028         subclass."""
01029         self.soup.endData()
01030         self.handle_data(text)
01031         self.soup.endData(subclass)

def BeautifulSoup::HTMLParserBuilder::_toStringSubclass (   self,
  text,
  subclass 
) [private]
Adds a certain piece of text to the tree as a NavigableString
subclass.

Definition at line 1025 of file BeautifulSoup.py.

01026                                                :
01027         """Adds a certain piece of text to the tree as a NavigableString
01028         subclass."""
01029         self.soup.endData()
01030         self.handle_data(text)
01031         self.soup.endData(subclass)

def BeautifulSoup::HTMLParserBuilder::handle_charref (   self,
  ref 
)

Definition at line 1044 of file BeautifulSoup.py.

01045                                  :
01046         "Handle character references as data."
01047         if self.soup.convertEntities:
01048             data = unichr(int(ref))
01049         else:
01050             data = '&#%s;' % ref
01051         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_charref (   self,
  ref 
)

Definition at line 1044 of file BeautifulSoup.py.

01045                                  :
01046         "Handle character references as data."
01047         if self.soup.convertEntities:
01048             data = unichr(int(ref))
01049         else:
01050             data = '&#%s;' % ref
01051         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_comment (   self,
  text 
)

Definition at line 1040 of file BeautifulSoup.py.

01041                                   :
01042         "Handle comments as Comment objects."
01043         self._toStringSubclass(text, Comment)

def BeautifulSoup::HTMLParserBuilder::handle_comment (   self,
  text 
)

Definition at line 1040 of file BeautifulSoup.py.

01041                                   :
01042         "Handle comments as Comment objects."
01043         self._toStringSubclass(text, Comment)

def BeautifulSoup::HTMLParserBuilder::handle_data (   self,
  content 
)

Definition at line 1022 of file BeautifulSoup.py.

01023                                   :
01024         self.soup.handle_data(content)

def BeautifulSoup::HTMLParserBuilder::handle_data (   self,
  content 
)

Definition at line 1022 of file BeautifulSoup.py.

01023                                   :
01024         self.soup.handle_data(content)

def BeautifulSoup::HTMLParserBuilder::handle_decl (   self,
  data 
)

Definition at line 1095 of file BeautifulSoup.py.

01096                                :
01097         "Handle DOCTYPEs and the like as Declaration objects."
01098         self._toStringSubclass(data, Declaration)

def BeautifulSoup::HTMLParserBuilder::handle_decl (   self,
  data 
)

Definition at line 1095 of file BeautifulSoup.py.

01096                                :
01097         "Handle DOCTYPEs and the like as Declaration objects."
01098         self._toStringSubclass(data, Declaration)

def BeautifulSoup::HTMLParserBuilder::handle_endtag (   self,
  name 
)

Definition at line 1019 of file BeautifulSoup.py.

01020                                  :
01021         self.soup.unknown_endtag(name)

def BeautifulSoup::HTMLParserBuilder::handle_endtag (   self,
  name 
)

Definition at line 1019 of file BeautifulSoup.py.

01020                                  :
01021         self.soup.unknown_endtag(name)

def BeautifulSoup::HTMLParserBuilder::handle_entityref (   self,
  ref 
)
Handle entity references as data, possibly converting known
HTML and/or XML entity references to the corresponding Unicode
characters.

Definition at line 1052 of file BeautifulSoup.py.

01053                                    :
01054         """Handle entity references as data, possibly converting known
01055         HTML and/or XML entity references to the corresponding Unicode
01056         characters."""
01057         data = None
01058         if self.soup.convertHTMLEntities:
01059             try:
01060                 data = unichr(name2codepoint[ref])
01061             except KeyError:
01062                 pass
01063 
01064         if not data and self.soup.convertXMLEntities:
01065                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
01066 
01067         if not data and self.soup.convertHTMLEntities and \
01068             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
01069                 # TODO: We've got a problem here. We're told this is
01070                 # an entity reference, but it's not an XML entity
01071                 # reference or an HTML entity reference. Nonetheless,
01072                 # the logical thing to do is to pass it through as an
01073                 # unrecognized entity reference.
01074                 #
01075                 # Except: when the input is "&carol;" this function
01076                 # will be called with input "carol". When the input is
01077                 # "AT&T", this function will be called with input
01078                 # "T". We have no way of knowing whether a semicolon
01079                 # was present originally, so we don't know whether
01080                 # this is an unknown entity or just a misplaced
01081                 # ampersand.
01082                 #
01083                 # The more common case is a misplaced ampersand, so I
01084                 # escape the ampersand and omit the trailing semicolon.
01085                 data = "&%s" % ref
01086         if not data:
01087             # This case is different from the one above, because we
01088             # haven't already gone through a supposedly comprehensive
01089             # mapping of entities to Unicode characters. We might not
01090             # have gone through any mapping at all. So the chances are
01091             # very high that this is a real entity, and not a
01092             # misplaced ampersand.
01093             data = "&%s;" % ref
01094         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_entityref (   self,
  ref 
)
Handle entity references as data, possibly converting known
HTML and/or XML entity references to the corresponding Unicode
characters.

Definition at line 1052 of file BeautifulSoup.py.

01053                                    :
01054         """Handle entity references as data, possibly converting known
01055         HTML and/or XML entity references to the corresponding Unicode
01056         characters."""
01057         data = None
01058         if self.soup.convertHTMLEntities:
01059             try:
01060                 data = unichr(name2codepoint[ref])
01061             except KeyError:
01062                 pass
01063 
01064         if not data and self.soup.convertXMLEntities:
01065                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
01066 
01067         if not data and self.soup.convertHTMLEntities and \
01068             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
01069                 # TODO: We've got a problem here. We're told this is
01070                 # an entity reference, but it's not an XML entity
01071                 # reference or an HTML entity reference. Nonetheless,
01072                 # the logical thing to do is to pass it through as an
01073                 # unrecognized entity reference.
01074                 #
01075                 # Except: when the input is "&carol;" this function
01076                 # will be called with input "carol". When the input is
01077                 # "AT&T", this function will be called with input
01078                 # "T". We have no way of knowing whether a semicolon
01079                 # was present originally, so we don't know whether
01080                 # this is an unknown entity or just a misplaced
01081                 # ampersand.
01082                 #
01083                 # The more common case is a misplaced ampersand, so I
01084                 # escape the ampersand and omit the trailing semicolon.
01085                 data = "&%s" % ref
01086         if not data:
01087             # This case is different from the one above, because we
01088             # haven't already gone through a supposedly comprehensive
01089             # mapping of entities to Unicode characters. We might not
01090             # have gone through any mapping at all. So the chances are
01091             # very high that this is a real entity, and not a
01092             # misplaced ampersand.
01093             data = "&%s;" % ref
01094         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_pi (   self,
  text 
)
Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later.

Definition at line 1032 of file BeautifulSoup.py.

01033                              :
01034         """Handle a processing instruction as a ProcessingInstruction
01035         object, possibly one with a %SOUP-ENCODING% slot into which an
01036         encoding will be plugged later."""
01037         if text[:3] == "xml":
01038             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
01039         self._toStringSubclass(text, ProcessingInstruction)

def BeautifulSoup::HTMLParserBuilder::handle_pi (   self,
  text 
)
Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later.

Definition at line 1032 of file BeautifulSoup.py.

01033                              :
01034         """Handle a processing instruction as a ProcessingInstruction
01035         object, possibly one with a %SOUP-ENCODING% slot into which an
01036         encoding will be plugged later."""
01037         if text[:3] == "xml":
01038             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
01039         self._toStringSubclass(text, ProcessingInstruction)

def BeautifulSoup::HTMLParserBuilder::handle_starttag (   self,
  name,
  attrs 
)

Definition at line 1013 of file BeautifulSoup.py.

01014                                           :
01015         if name == 'meta':
01016             self.soup.extractCharsetFromMeta(attrs)
01017         else:
01018             self.soup.unknown_starttag(name, attrs)

def BeautifulSoup::HTMLParserBuilder::handle_starttag (   self,
  name,
  attrs 
)

Definition at line 1013 of file BeautifulSoup.py.

01014                                           :
01015         if name == 'meta':
01016             self.soup.extractCharsetFromMeta(attrs)
01017         else:
01018             self.soup.unknown_starttag(name, attrs)

def BeautifulSoup::HTMLParserBuilder::parse_declaration (   self,
  i 
)
Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object.

Definition at line 1099 of file BeautifulSoup.py.

01100                                   :
01101         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01102         declaration as a CData object."""
01103         j = None
01104         if self.rawdata[i:i+9] == '<![CDATA[':
01105              k = self.rawdata.find(']]>', i)
01106              if k == -1:
01107                  k = len(self.rawdata)
01108              data = self.rawdata[i+9:k]
01109              j = k+3
01110              self._toStringSubclass(data, CData)
01111         else:
01112             try:
01113                 j = HTMLParser.parse_declaration(self, i)
01114             except HTMLParseError:
01115                 toHandle = self.rawdata[i:]
01116                 self.handle_data(toHandle)
01117                 j = i + len(toHandle)
01118         return j
01119 

def BeautifulSoup::HTMLParserBuilder::parse_declaration (   self,
  i 
)
Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object.

Definition at line 1099 of file BeautifulSoup.py.

01100                                   :
01101         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01102         declaration as a CData object."""
01103         j = None
01104         if self.rawdata[i:i+9] == '<![CDATA[':
01105              k = self.rawdata.find(']]>', i)
01106              if k == -1:
01107                  k = len(self.rawdata)
01108              data = self.rawdata[i+9:k]
01109              j = k+3
01110              self._toStringSubclass(data, CData)
01111         else:
01112             try:
01113                 j = HTMLParser.parse_declaration(self, i)
01114             except HTMLParseError:
01115                 toHandle = self.rawdata[i:]
01116                 self.handle_data(toHandle)
01117                 j = i + len(toHandle)
01118         return j
01119 


Member Data Documentation

BeautifulSoup::HTMLParserBuilder::soup

Definition at line 1007 of file BeautifulSoup.py.