1
=== modified file 'lalita/plugins/url.py'
2
--- lalita/plugins/url.py	2013-05-28 18:41:49 +0000
3
+++ lalita/plugins/url.py	2014-09-22 22:09:57 +0000
4
@@ -16,6 +16,8 @@
5
16
16
6
17
import magic
17
import magic
7
18
import chardet
18
import chardet
8
19
import gzip
9
20
import StringIO
10
19
21
11
20
import sqlite3
22
import sqlite3
12
21
import datetime
23
import datetime
13
@@ -237,16 +239,11 @@
14
237
        g = self.content_type_re.search(page)
239
        g = self.content_type_re.search(page)
15
238
        if g is not None:
240
        if g is not None:
16
239
            mimetype_enc = g.groups()[0]
241
            mimetype_enc = g.groups()[0]
18
240
            self.logger.debug(mimetype_enc)
242
            self.logger.debug('m-enc found in <meta content-type=...>: %s' % mimetype_enc)
19
241
            # text/html; charset=utf-8
243
            # text/html; charset=utf-8
26
242
            g = self.mimetype_re.search(mimetype_enc)
244
            mimetype, encoding= self.mimetype_enc (mimetype_enc)
21
243
            if g is not None:
22
244
                mimetype = g.groups()[0]
23
245
                encoding = g.groups()[2]
24
246
            else:
25
247
                self.logger.warning("further mimetype detection failed: %s", mimetype_enc)
27
248
        else:
245
        else:
29
249
            self.logger.warning("no mimetype in the page")
246
            self.logger.warning ("no <meta content-type=...> in the page")
30
250
247
31
251
        return encoding
248
        return encoding
32
252
249
33
@@ -268,29 +265,58 @@
34
268
        '''as good as any'''
265
        '''as good as any'''
35
269
        return 'utf-8'
266
        return 'utf-8'
36
270
267
37
268
    def mimetype_enc (self, mimetype_enc):
        '''Split a combined mimetype/encoding string into its two parts.

        mimetype_enc is a string such as 'text/html; charset=utf-8'.
        It is matched against self.mimetype_re.

        Returns a (mimetype, encoding) tuple built from groups 0 and 2 of
        the match.  When mimetype_enc is None or the expression does not
        match, a warning is logged and (None, None) is returned.
        '''
        g= None
        # re.search() raises TypeError on None; treat that case like any
        # other input we cannot parse instead of blowing up in the caller
        if mimetype_enc is not None:
            g= self.mimetype_re.search (mimetype_enc)

        if g is None:
            # warning() with lazy arguments, like the rest of the plugin;
            # warn() is only a deprecated alias
            self.logger.warning ("mimetype detection failed: %s", mimetype_enc)
            return None, None

        groups= g.groups ()
        # group 1 is the whole optional '; charset=...' part, which callers
        # do not need, so only groups 0 and 2 are returned
        return groups[0], groups[2]
48
279
49
271
    def guessFile (self, page, user, channel, url, date, time):
280
    def guessFile (self, page, user, channel, url, date, time):
50
272
        mimetype_enc= self.magic.buffer (page)
281
        mimetype_enc= self.magic.buffer (page)
51
273
        self.logger.debug('mime type found with magic: %s',  mimetype_enc)
282
        self.logger.debug('mime type found with magic: %s',  mimetype_enc)
59
274
        g = self.mimetype_re.search(mimetype_enc)
283
60
275
        if g is not None:
284
        mimetype, encoding= self.mimetype_enc (mimetype_enc)
61
276
            mimetype= g.groups()[0]
285
        w_mimetype= None
62
277
            # BUG? we throw away this detected encoding later!
286
        w_encoding= None
63
278
            encoding= g.groups()[2]
287
64
279
        else:
288
        # if compressed, uncompress before any further tests
65
280
            self.logger.warn ("initial mimetype detection failed: %s", mimetype_enc)
289
        if mimetype in ('application/gzip', 'application/x-gzip', 'application/octet-stream'):
66
290
            self.logger.debug ('possible compressed blob found, checking inside...')
67
291
            try:
68
292
                page= gzip.GzipFile (fileobj=StringIO.StringIO (page)).read ()
69
293
            except IOError, e:
70
294
                # f= open ('page.dump', 'w+')
71
295
                self.logger.info ("couldn't decompress page, %s", e)
72
296
                # f.write (page)
73
297
                # f.close ()
74
298
            else:
75
299
                w_mimetype= mimetype
76
300
                w_encoding= encoding
77
301
                mimetype_enc= self.magic.buffer (page)
78
302
                self.logger.debug ('mime type found inside compressed blob: %s' % mimetype_enc)
79
303
                mimetype, encoding= self.mimetype_enc (mimetype_enc)
80
281
304
81
282
        # xhtml detection
305
        # xhtml detection
82
283
        g= self.xhtml_re.search (page)
306
        g= self.xhtml_re.search (page)
83
284
307
85
285
        # TODO: handle application/gzip, application/x-gzip, text/x-asm
308
        # TODO: handle text/x-asm
86
286
309
87
287
        # text/plain? yes, text/plain too...
310
        # text/plain? yes, text/plain too...
88
288
        # see http://blog.nixternal.com/2009.03.30/where-is-ctrlaltbackspace/
311
        # see http://blog.nixternal.com/2009.03.30/where-is-ctrlaltbackspace/
89
289
        # text/x-c should be some kind of xml,
312
        # text/x-c should be some kind of xml,
90
290
        # but of course not everybody says the correct things
313
        # but of course not everybody says the correct things
91
291
        # see http://www.cadena3.com.ar/contenido/2009/06/13/32131.asp
314
        # see http://www.cadena3.com.ar/contenido/2009/06/13/32131.asp
92
315
        # and magic does not seem to correctly detect 'HTML document, UTF-8 Unicode text, with very long lines'
93
316
        # see http://www.youtube.com/watch?v=oDPCmmZifE8&list=UU3XTzVzaHQEd30rQbuvCtTQ
94
292
        if (mimetype in ('text/html', 'text/plain', 'text/x-c') or
317
        if (mimetype in ('text/html', 'text/plain', 'text/x-c') or
96
293
                mimetype in ('text/xml', 'application/xml') and g is not None):
318
            mimetype in ('text/xml', 'application/xml', 'application/octet-stream') and
97
319
            g is not None):
98
294
320
99
295
            g= self.title_re.search (page)
321
            g= self.title_re.search (page)
100
296
            if g is not None:
322
            if g is not None:
101
@@ -334,8 +360,13 @@
102
334
                self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
360
                self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
103
335
                title= mimetype
361
                title= mimetype
104
336
        else:
362
        else:
105
363
            if w_mimetype is not None:
106
364
                # we tried to see inside a compressed blob, restore to the mimetype of that wrapper
107
365
                mimetype= w_mimetype
108
366
109
337
            self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
367
            self.addUrl (channel, user, url, mimetype=mimetype, date=date, time=time)
110
338
            title= mimetype
368
            title= mimetype
111
369
112
339
        self.urlsOk += 1
370
        self.urlsOk += 1
113
340
        return url, True, title
371
        return url, True, title
114
341
372
Reviewer	Review Type	Date Requested	Status
laliputienses		2014-09-18	Pending
Review via email: mp+235114@code.launchpad.net