Merge lp:~mdione/lalita/url-plugin into lp:lalita

Proposed by Marcos Dione
Status: Merged
Merged at revision: 178
Proposed branch: lp:~mdione/lalita/url-plugin
Merge into: lp:lalita
Diff against target: 113 lines (+48/-17)
1 file modified
lalita/plugins/url.py (+48/-17)
To merge this branch: bzr merge lp:~mdione/lalita/url-plugin
Reviewer Review Type Date Requested Status
laliputienses Pending
Review via email: mp+235114@code.launchpad.net

Description of the change

This patch adds basic support for compressed pages. Currently it can only handle gzip-compressed pages, which is how most compressed pages are served anyway.

To post a comment you must log in.
lp:~mdione/lalita/url-plugin updated
179. By Marcos Dione

[+] also handle x-gzip. [*] fix exception handling (we're not using zlib anymore). [*] better log messages.

180. By Marcos Dione

[+] magic does not properly detect 'HTML document, UTF-8 Unicode text, with very long lines', so we must check inside application/octet-stream.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'lalita/plugins/url.py'
--- lalita/plugins/url.py 2013-05-28 18:41:49 +0000
+++ lalita/plugins/url.py 2014-09-22 22:09:57 +0000
@@ -16,6 +16,8 @@
1616
17import magic17import magic
18import chardet18import chardet
19import gzip
20import StringIO
1921
20import sqlite322import sqlite3
21import datetime23import datetime
@@ -237,16 +239,11 @@
237 g = self.content_type_re.search(page)239 g = self.content_type_re.search(page)
238 if g is not None:240 if g is not None:
239 mimetype_enc = g.groups()[0]241 mimetype_enc = g.groups()[0]
240 self.logger.debug(mimetype_enc)242 self.logger.debug('m-enc found in <meta content-type=...>: %s' % mimetype_enc)
241 # text/html; charset=utf-8243 # text/html; charset=utf-8
242 g = self.mimetype_re.search(mimetype_enc)244 mimetype, encoding= self.mimetype_enc (mimetype_enc)
243 if g is not None:
244 mimetype = g.groups()[0]
245 encoding = g.groups()[2]
246 else:
247 self.logger.warning("further mimetype detection failed: %s", mimetype_enc)
248 else:245 else:
249 self.logger.warning("no mimetype in the page")246 self.logger.warning ("no <meta content-type=...> in the page")
250247
251 return encoding248 return encoding
252249
@@ -268,29 +265,58 @@
268 '''as good as any'''265 '''as good as any'''
269 return 'utf-8'266 return 'utf-8'
270267
268 def mimetype_enc (self, mimetype_enc):
269 mimetype, encoding= None, None
270
271 g= self.mimetype_re.search (mimetype_enc)
272 if g is not None:
273 mimetype= g.groups ()[0]
274 encoding= g.groups ()[2]
275 else:
276 self.logger.warn ("mimetype detection failed: %s" % mimetype_enc)
277
278 return mimetype, encoding
279
271 def guessFile (self, page, user, channel, url, date, time):280 def guessFile (self, page, user, channel, url, date, time):
272 mimetype_enc= self.magic.buffer (page)281 mimetype_enc= self.magic.buffer (page)
273 self.logger.debug('mime type found with magic: %s', mimetype_enc)282 self.logger.debug('mime type found with magic: %s', mimetype_enc)
274 g = self.mimetype_re.search(mimetype_enc)283
275 if g is not None:284 mimetype, encoding= self.mimetype_enc (mimetype_enc)
276 mimetype= g.groups()[0]285 w_mimetype= None
277 # BUG? we throw away this detected encoding later!286 w_encoding= None
278 encoding= g.groups()[2]287
279 else:288 # if compressed, uncompress before any further tests
280 self.logger.warn ("initial mimetype detection failed: %s", mimetype_enc)289 if mimetype in ('application/gzip', 'application/x-gzip', 'application/octet-stream'):
290 self.logger.debug ('possible compressed blob found, checking inside...')
291 try:
292 page= gzip.GzipFile (fileobj=StringIO.StringIO (page)).read ()
293 except IOError, e:
294 # f= open ('page.dump', 'w+')
295 self.logger.info ("couldn't decompress page, %s", e)
296 # f.write (page)
297 # f.close ()
298 else:
299 w_mimetype= mimetype
300 w_encoding= encoding
301 mimetype_enc= self.magic.buffer (page)
302 self.logger.debug ('mime type found inside compressed blob: %s' % mimetype_enc)
303 mimetype, encoding= self.mimetype_enc (mimetype_enc)
281304
282 # xhtml detection305 # xhtml detection
283 g= self.xhtml_re.search (page)306 g= self.xhtml_re.search (page)
284307
285 # TODO: handle application/gzip, application/x-gzip, text/x-asm308 # TODO: handle text/x-asm
286309
287 # text/plain? yes, text/plain too...310 # text/plain? yes, text/plain too...
288 # see http://blog.nixternal.com/2009.03.30/where-is-ctrlaltbackspace/311 # see http://blog.nixternal.com/2009.03.30/where-is-ctrlaltbackspace/
289 # text/x-c should be some kind of xml,312 # text/x-c should be some kind of xml,
290 # but of course not everybody says the correct things313 # but of course not everybody says the correct things
291 # see http://www.cadena3.com.ar/contenido/2009/06/13/32131.asp314 # see http://www.cadena3.com.ar/contenido/2009/06/13/32131.asp
315 # and magic does not seem to correctly detect 'HTML document, UTF-8 Unicode text, with very long lines'
316 # see http://www.youtube.com/watch?v=oDPCmmZifE8&list=UU3XTzVzaHQEd30rQbuvCtTQ
292 if (mimetype in ('text/html', 'text/plain', 'text/x-c') or317 if (mimetype in ('text/html', 'text/plain', 'text/x-c') or
293 mimetype in ('text/xml', 'application/xml') and g is not None):318 mimetype in ('text/xml', 'application/xml', 'application/octet-stream') and
319 g is not None):
294320
295 g= self.title_re.search (page)321 g= self.title_re.search (page)
296 if g is not None:322 if g is not None:
@@ -334,8 +360,13 @@
334 self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)360 self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
335 title= mimetype361 title= mimetype
336 else:362 else:
363 if w_mimetype is not None:
364 # we tried to see inside a compressed blob, restore to the mimetype of that wrapper
365 mimetype= w_mimetype
366
337 self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)367 self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
338 title= mimetype368 title= mimetype
369
339 self.urlsOk += 1370 self.urlsOk += 1
340 return url, True, title371 return url, True, title
341372

Subscribers

People subscribed via source and target branches

to all changes: