55
66from .cleaners import normalize_spaces , clean_attributes
77from .encoding import get_encoding
8+ from .compat import str_
89
910utf8_parser = lxml .html .HTMLParser (encoding = 'utf-8' )
1011
11- if sys .version_info [0 ] == 2 :
12- str = unicode
13-
def build_doc(page):
    """Parse *page* into an lxml HTML document.

    :param page: the page markup, either already-decoded text (an instance
        of ``str_``) or an encoded byte string.
    :returns: a ``(doc, encoding)`` tuple.  ``doc`` is the result of
        ``lxml.html.document_fromstring``; ``encoding`` is the encoding used
        to decode *page*, or ``None`` when *page* was already text.
    """
    if isinstance(page, str_):
        # Already text: nothing to detect or decode.
        encoding = None
        decoded_page = page
    else:
        # Use the detected encoding; fall back to utf-8 when get_encoding()
        # returns a falsy value.
        encoding = get_encoding(page) or 'utf-8'
        decoded_page = page.decode(encoding, 'replace')

    # XXX: we have to do .decode and .encode even for utf-8 pages to remove
    # bad characters
    utf8_bytes = decoded_page.encode('utf-8', 'replace')
    doc = lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser)
    return doc, encoding
2323
2424def js_re (src , pattern , flags , repl ):
2525 return re .compile (pattern , flags ).sub (src , repl .replace ('$' , '\\ ' ))
2626
27-
2827def normalize_entities (cur_title ):
2928 entities = {
3029 u'\u2014 ' :'-' ,
@@ -58,6 +57,10 @@ def add_match(collection, text, orig):
5857 if text .replace ('"' , '' ) in orig .replace ('"' , '' ):
5958 collection .add (text )
6059
# CSS selectors iterated by shorten_title() (via doc.cssselect) to find
# elements whose text may be a title candidate.
TITLE_CSS_HEURISTICS = [
    '#title',
    '#head',
    '#heading',
    '.pageTitle',
    '.news_title',
    '.title',
    '.head',
    '.heading',
    '.contentheading',
    '.small_header_red',
]
63+
6164def shorten_title (doc ):
6265 title = doc .find ('.//title' )
6366 if title is None or title .text is None or len (title .text ) == 0 :
@@ -74,7 +77,7 @@ def shorten_title(doc):
7477 if e .text_content ():
7578 add_match (candidates , e .text_content (), orig )
7679
77- for item in [ '#title' , '#head' , '#heading' , '.pageTitle' , '.news_title' , '.title' , '.head' , '.heading' , '.contentheading' , '.small_header_red' ] :
80+ for item in TITLE_CSS_HEURISTICS :
7881 for e in doc .cssselect (item ):
7982 if e .text :
8083 add_match (candidates , e .text , orig )
@@ -107,8 +110,11 @@ def shorten_title(doc):
107110 return title
108111
109112def get_body (doc ):
110- [ elem .drop_tree () for elem in doc .xpath ('.//script | .//link | .//style' ) ]
111- raw_html = str (tostring (doc .body or doc ))
113+ for elem in doc .xpath ('.//script | .//link | .//style' ):
114+ elem .drop_tree ()
115+ # tostring() always returns a utf-8 encoded string
116+ # FIXME: wouldn't it be better to use tounicode()?
117+ raw_html = str_ (tostring (doc .body or doc ))
112118 cleaned = clean_attributes (raw_html )
113119 try :
114120 #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
0 commit comments