Merge lp:~mdione/lalita/url-plugin into lp:lalita

Proposed by Marcos Dione
Status: Merged
Merged at revision: 178
Proposed branch: lp:~mdione/lalita/url-plugin
Merge into: lp:lalita
Diff against target: 113 lines (+48/-17)
1 file modified
lalita/plugins/url.py (+48/-17)
To merge this branch: bzr merge lp:~mdione/lalita/url-plugin
Reviewer | Review Type | Date Requested | Status
laliputienses | | | Pending
Review via email: mp+235114@code.launchpad.net

Description of the change

This patch adds basic support for compressed pages. Currently it can only handle gzipped pages, which is how most compressed pages are encoded anyway.

To post a comment you must log in.
lp:~mdione/lalita/url-plugin updated
179. By Marcos Dione

[+] also handle x-gzip. [*] fix exception handling (we're not using zlib anymore). [*] better log messages.

180. By Marcos Dione

[+] magic does not properly detect 'HTML document, UTF-8 Unicode text, with very long lines', so we must also look inside content reported as application/octet-stream.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lalita/plugins/url.py'
2--- lalita/plugins/url.py 2013-05-28 18:41:49 +0000
3+++ lalita/plugins/url.py 2014-09-22 22:09:57 +0000
4@@ -16,6 +16,8 @@
5
6 import magic
7 import chardet
8+import gzip
9+import StringIO
10
11 import sqlite3
12 import datetime
13@@ -237,16 +239,11 @@
14 g = self.content_type_re.search(page)
15 if g is not None:
16 mimetype_enc = g.groups()[0]
17- self.logger.debug(mimetype_enc)
18+ self.logger.debug('m-enc found in <meta content-type=...>: %s' % mimetype_enc)
19 # text/html; charset=utf-8
20- g = self.mimetype_re.search(mimetype_enc)
21- if g is not None:
22- mimetype = g.groups()[0]
23- encoding = g.groups()[2]
24- else:
25- self.logger.warning("further mimetype detection failed: %s", mimetype_enc)
26+ mimetype, encoding= self.mimetype_enc (mimetype_enc)
27 else:
28- self.logger.warning("no mimetype in the page")
29+ self.logger.warning ("no <meta content-type=...> in the page")
30
31 return encoding
32
33@@ -268,29 +265,58 @@
34 '''as good as any'''
35 return 'utf-8'
36
37+ def mimetype_enc (self, mimetype_enc):
38+ mimetype, encoding= None, None
39+
40+ g= self.mimetype_re.search (mimetype_enc)
41+ if g is not None:
42+ mimetype= g.groups ()[0]
43+ encoding= g.groups ()[2]
44+ else:
45+ self.logger.warn ("mimetype detection failed: %s" % mimetype_enc)
46+
47+ return mimetype, encoding
48+
49 def guessFile (self, page, user, channel, url, date, time):
50 mimetype_enc= self.magic.buffer (page)
51 self.logger.debug('mime type found with magic: %s', mimetype_enc)
52- g = self.mimetype_re.search(mimetype_enc)
53- if g is not None:
54- mimetype= g.groups()[0]
55- # BUG? we throw away this detected encoding later!
56- encoding= g.groups()[2]
57- else:
58- self.logger.warn ("initial mimetype detection failed: %s", mimetype_enc)
59+
60+ mimetype, encoding= self.mimetype_enc (mimetype_enc)
61+ w_mimetype= None
62+ w_encoding= None
63+
64+ # if compressed, uncompress before any further tests
65+ if mimetype in ('application/gzip', 'application/x-gzip', 'application/octet-stream'):
66+ self.logger.debug ('possible compressed blob found, checking inside...')
67+ try:
68+ page= gzip.GzipFile (fileobj=StringIO.StringIO (page)).read ()
69+ except IOError, e:
70+ # f= open ('page.dump', 'w+')
71+ self.logger.info ("couldn't decompress page, %s", e)
72+ # f.write (page)
73+ # f.close ()
74+ else:
75+ w_mimetype= mimetype
76+ w_encoding= encoding
77+ mimetype_enc= self.magic.buffer (page)
78+ self.logger.debug ('mime type found inside compressed blob: %s' % mimetype_enc)
79+ mimetype, encoding= self.mimetype_enc (mimetype_enc)
80
81 # xhtml detection
82 g= self.xhtml_re.search (page)
83
84- # TODO: handle application/gzip, application/x-gzip, text/x-asm
85+ # TODO: handle text/x-asm
86
87 # text/plain? yes, text/plain too...
88 # see http://blog.nixternal.com/2009.03.30/where-is-ctrlaltbackspace/
89 # text/x-c should be some kind of xml,
90 # but of course not everybody says the correct things
91 # see http://www.cadena3.com.ar/contenido/2009/06/13/32131.asp
92+ # and magic does not seem to correctly detect 'HTML document, UTF-8 Unicode text, with very long lines'
93+ # see http://www.youtube.com/watch?v=oDPCmmZifE8&list=UU3XTzVzaHQEd30rQbuvCtTQ
94 if (mimetype in ('text/html', 'text/plain', 'text/x-c') or
95- mimetype in ('text/xml', 'application/xml') and g is not None):
96+ mimetype in ('text/xml', 'application/xml', 'application/octet-stream') and
97+ g is not None):
98
99 g= self.title_re.search (page)
100 if g is not None:
101@@ -334,8 +360,13 @@
102 self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
103 title= mimetype
104 else:
105+ if w_mimetype is not None:
106+ # we tried to see inside a compressed blob, restore to the mimetype of that wrapper
107+ mimetype= w_mimetype
108+
109 self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
110 title= mimetype
111+
112 self.urlsOk += 1
113 return url, True, title
114

Subscribers

People subscribed via source and target branches

to all changes: