Merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1 into lp:ubuntu/vivid/urlgrabber

Proposed by Jackson Doak on 2014-12-13
Status: Needs review
Proposed branch: lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
Merge into: lp:ubuntu/vivid/urlgrabber
Diff against target: 7325 lines (+1389/-4846)
26 files modified
.pc/applied-patches (+0/-3)
.pc/grabber_fix.diff/urlgrabber/grabber.py (+0/-1730)
.pc/progress_fix.diff/urlgrabber/progress.py (+0/-755)
.pc/progress_object_callback_fix.diff/urlgrabber/grabber.py (+0/-1802)
ChangeLog (+8/-0)
MANIFEST (+2/-0)
PKG-INFO (+22/-22)
README (+1/-1)
debian/changelog (+7/-0)
debian/patches/grabber_fix.diff (+0/-236)
debian/patches/progress_fix.diff (+0/-11)
debian/patches/progress_object_callback_fix.diff (+0/-21)
debian/patches/series (+0/-3)
scripts/urlgrabber (+14/-6)
scripts/urlgrabber-ext-down (+75/-0)
setup.py (+4/-2)
test/base_test_code.py (+1/-1)
test/munittest.py (+3/-3)
test/test_byterange.py (+1/-13)
test/test_grabber.py (+2/-1)
test/test_mirror.py (+72/-1)
urlgrabber/__init__.py (+5/-4)
urlgrabber/byterange.py (+8/-8)
urlgrabber/grabber.py (+901/-152)
urlgrabber/mirror.py (+54/-11)
urlgrabber/progress.py (+209/-60)
To merge this branch: bzr merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
Reviewer | Review Type | Date Requested | Status
Daniel Holbach | | 2014-12-13 | Needs Fixing on 2014-12-16
Review via email: mp+244676@code.launchpad.net

Description of the change

New upstream release; some patches have been upstreamed.

To post a comment you must log in.
Daniel Holbach (dholbach) wrote :

daniel@daydream:~/urlgrabber$ bzr merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
Unapplying quilt patches to prevent spurious conflicts
+N scripts/urlgrabber-ext-down
 M ChangeLog
 M MANIFEST
 M PKG-INFO
 M README
 M debian/changelog
-D debian/patches/grabber_fix.diff
-D debian/patches/progress_fix.diff
-D debian/patches/progress_object_callback_fix.diff
 M debian/patches/series
 M scripts/urlgrabber
 M setup.py
 M test/base_test_code.py
 M test/munittest.py
 M test/test_byterange.py
 M test/test_grabber.py
 M test/test_mirror.py
 M urlgrabber/__init__.py
 M urlgrabber/byterange.py
 M urlgrabber/grabber.py
 M urlgrabber/mirror.py
 M urlgrabber/progress.py
Text conflict in urlgrabber/grabber.py
1 conflicts encountered.
daniel@daydream:~/urlgrabber$

review: Needs Fixing

Unmerged revisions

12. By Jackson Doak on 2014-12-13

* New upstream release.
* Drop all patches, fixed upstream

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== removed file '.pc/applied-patches'
--- .pc/applied-patches 2011-08-09 17:45:08 +0000
+++ .pc/applied-patches 1970-01-01 00:00:00 +0000
@@ -1,3 +0,0 @@
1grabber_fix.diff
2progress_fix.diff
3progress_object_callback_fix.diff
40
=== removed directory '.pc/grabber_fix.diff'
=== removed directory '.pc/grabber_fix.diff/urlgrabber'
=== removed file '.pc/grabber_fix.diff/urlgrabber/grabber.py'
--- .pc/grabber_fix.diff/urlgrabber/grabber.py 2010-07-08 17:40:08 +0000
+++ .pc/grabber_fix.diff/urlgrabber/grabber.py 1970-01-01 00:00:00 +0000
@@ -1,1730 +0,0 @@
1# This library is free software; you can redistribute it and/or
2# modify it under the terms of the GNU Lesser General Public
3# License as published by the Free Software Foundation; either
4# version 2.1 of the License, or (at your option) any later version.
5#
6# This library is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9# Lesser General Public License for more details.
10#
11# You should have received a copy of the GNU Lesser General Public
12# License along with this library; if not, write to the
13# Free Software Foundation, Inc.,
14# 59 Temple Place, Suite 330,
15# Boston, MA 02111-1307 USA
16
17# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
20
21"""A high-level cross-protocol url-grabber.
22
23GENERAL ARGUMENTS (kwargs)
24
25 Where possible, the module-level default is indicated, and legal
26 values are provided.
27
28 copy_local = 0 [0|1]
29
30 ignored except for file:// urls, in which case it specifies
31 whether urlgrab should still make a copy of the file, or simply
32 point to the existing copy. The module level default for this
33 option is 0.
34
35 close_connection = 0 [0|1]
36
37 tells URLGrabber to close the connection after a file has been
38 transfered. This is ignored unless the download happens with the
39 http keepalive handler (keepalive=1). Otherwise, the connection
40 is left open for further use. The module level default for this
41 option is 0 (keepalive connections will not be closed).
42
43 keepalive = 1 [0|1]
44
45 specifies whether keepalive should be used for HTTP/1.1 servers
46 that support it. The module level default for this option is 1
47 (keepalive is enabled).
48
49 progress_obj = None
50
51 a class instance that supports the following methods:
52 po.start(filename, url, basename, length, text)
53 # length will be None if unknown
54 po.update(read) # read == bytes read so far
55 po.end()
56
57 text = None
58
59 specifies alternative text to be passed to the progress meter
60 object. If not given, the default progress meter will use the
61 basename of the file.
62
63 throttle = 1.0
64
65 a number - if it's an int, it's the bytes/second throttle limit.
66 If it's a float, it is first multiplied by bandwidth. If throttle
67 == 0, throttling is disabled. If None, the module-level default
68 (which can be set on default_grabber.throttle) is used. See
69 BANDWIDTH THROTTLING for more information.
70
71 timeout = None
72
73 a positive float expressing the number of seconds to wait for socket
74 operations. If the value is None or 0.0, socket operations will block
75 forever. Setting this option causes urlgrabber to call the settimeout
76 method on the Socket object used for the request. See the Python
77 documentation on settimeout for more information.
78 http://www.python.org/doc/current/lib/socket-objects.html
79
80 bandwidth = 0
81
82 the nominal max bandwidth in bytes/second. If throttle is a float
83 and bandwidth == 0, throttling is disabled. If None, the
84 module-level default (which can be set on
85 default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
86 more information.
87
88 range = None
89
90 a tuple of the form (first_byte, last_byte) describing a byte
91 range to retrieve. Either or both of the values may set to
92 None. If first_byte is None, byte offset 0 is assumed. If
93 last_byte is None, the last byte available is assumed. Note that
94 the range specification is python-like in that (0,10) will yeild
95 the first 10 bytes of the file.
96
97 If set to None, no range will be used.
98
99 reget = None [None|'simple'|'check_timestamp']
100
101 whether to attempt to reget a partially-downloaded file. Reget
102 only applies to .urlgrab and (obviously) only if there is a
103 partially downloaded file. Reget has two modes:
104
105 'simple' -- the local file will always be trusted. If there
106 are 100 bytes in the local file, then the download will always
107 begin 100 bytes into the requested file.
108
109 'check_timestamp' -- the timestamp of the server file will be
110 compared to the timestamp of the local file. ONLY if the
111 local file is newer than or the same age as the server file
112 will reget be used. If the server file is newer, or the
113 timestamp is not returned, the entire file will be fetched.
114
115 NOTE: urlgrabber can do very little to verify that the partial
116 file on disk is identical to the beginning of the remote file.
117 You may want to either employ a custom "checkfunc" or simply avoid
118 using reget in situations where corruption is a concern.
119
120 user_agent = 'urlgrabber/VERSION'
121
122 a string, usually of the form 'AGENT/VERSION' that is provided to
123 HTTP servers in the User-agent header. The module level default
124 for this option is "urlgrabber/VERSION".
125
126 http_headers = None
127
128 a tuple of 2-tuples, each containing a header and value. These
129 will be used for http and https requests only. For example, you
130 can do
131 http_headers = (('Pragma', 'no-cache'),)
132
133 ftp_headers = None
134
135 this is just like http_headers, but will be used for ftp requests.
136
137 proxies = None
138
139 a dictionary that maps protocol schemes to proxy hosts. For
140 example, to use a proxy server on host "foo" port 3128 for http
141 and https URLs:
142 proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
143 note that proxy authentication information may be provided using
144 normal URL constructs:
145 proxies={ 'http' : 'http://user:host@foo:3128' }
146 Lastly, if proxies is None, the default environment settings will
147 be used.
148
149 prefix = None
150
151 a url prefix that will be prepended to all requested urls. For
152 example:
153 g = URLGrabber(prefix='http://foo.com/mirror/')
154 g.urlgrab('some/file.txt')
155 ## this will fetch 'http://foo.com/mirror/some/file.txt'
156 This option exists primarily to allow identical behavior to
157 MirrorGroup (and derived) instances. Note: a '/' will be inserted
158 if necessary, so you cannot specify a prefix that ends with a
159 partial file or directory name.
160
161 opener = None
162 No-op when using the curl backend (default)
163
164 cache_openers = True
165 No-op when using the curl backend (default)
166
167 data = None
168
169 Only relevant for the HTTP family (and ignored for other
170 protocols), this allows HTTP POSTs. When the data kwarg is
171 present (and not None), an HTTP request will automatically become
172 a POST rather than GET. This is done by direct passthrough to
173 urllib2. If you use this, you may also want to set the
174 'Content-length' and 'Content-type' headers with the http_headers
175 option. Note that python 2.2 handles the case of these
176 badly and if you do not use the proper case (shown here), your
177 values will be overridden with the defaults.
178
179 urlparser = URLParser()
180
181 The URLParser class handles pre-processing of URLs, including
182 auth-handling for user/pass encoded in http urls, file handing
183 (that is, filenames not sent as a URL), and URL quoting. If you
184 want to override any of this behavior, you can pass in a
185 replacement instance. See also the 'quote' option.
186
187 quote = None
188
189 Whether or not to quote the path portion of a url.
190 quote = 1 -> quote the URLs (they're not quoted yet)
191 quote = 0 -> do not quote them (they're already quoted)
192 quote = None -> guess what to do
193
194 This option only affects proper urls like 'file:///etc/passwd'; it
195 does not affect 'raw' filenames like '/etc/passwd'. The latter
196 will always be quoted as they are converted to URLs. Also, only
197 the path part of a url is quoted. If you need more fine-grained
198 control, you should probably subclass URLParser and pass it in via
199 the 'urlparser' option.
200
201 ssl_ca_cert = None
202
203 this option can be used if M2Crypto is available and will be
204 ignored otherwise. If provided, it will be used to create an SSL
205 context. If both ssl_ca_cert and ssl_context are provided, then
206 ssl_context will be ignored and a new context will be created from
207 ssl_ca_cert.
208
209 ssl_context = None
210
211 No-op when using the curl backend (default)
212
213
214 self.ssl_verify_peer = True
215
216 Check the server's certificate to make sure it is valid with what our CA validates
217
218 self.ssl_verify_host = True
219
220 Check the server's hostname to make sure it matches the certificate DN
221
222 self.ssl_key = None
223
224 Path to the key the client should use to connect/authenticate with
225
226 self.ssl_key_type = 'PEM'
227
228 PEM or DER - format of key
229
230 self.ssl_cert = None
231
232 Path to the ssl certificate the client should use to to authenticate with
233
234 self.ssl_cert_type = 'PEM'
235
236 PEM or DER - format of certificate
237
238 self.ssl_key_pass = None
239
240 password to access the ssl_key
241
242 self.size = None
243
244 size (in bytes) or Maximum size of the thing being downloaded.
245 This is mostly to keep us from exploding with an endless datastream
246
247 self.max_header_size = 2097152
248
249 Maximum size (in bytes) of the headers.
250
251
252RETRY RELATED ARGUMENTS
253
254 retry = None
255
256 the number of times to retry the grab before bailing. If this is
257 zero, it will retry forever. This was intentional... really, it
258 was :). If this value is not supplied or is supplied but is None
259 retrying does not occur.
260
261 retrycodes = [-1,2,4,5,6,7]
262
263 a sequence of errorcodes (values of e.errno) for which it should
264 retry. See the doc on URLGrabError for more details on this. You
265 might consider modifying a copy of the default codes rather than
266 building yours from scratch so that if the list is extended in the
267 future (or one code is split into two) you can still enjoy the
268 benefits of the default list. You can do that with something like
269 this:
270
271 retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
272 if 12 not in retrycodes:
273 retrycodes.append(12)
274
275 checkfunc = None
276
277 a function to do additional checks. This defaults to None, which
278 means no additional checking. The function should simply return
279 on a successful check. It should raise URLGrabError on an
280 unsuccessful check. Raising of any other exception will be
281 considered immediate failure and no retries will occur.
282
283 If it raises URLGrabError, the error code will determine the retry
284 behavior. Negative error numbers are reserved for use by these
285 passed in functions, so you can use many negative numbers for
286 different types of failure. By default, -1 results in a retry,
287 but this can be customized with retrycodes.
288
289 If you simply pass in a function, it will be given exactly one
290 argument: a CallbackObject instance with the .url attribute
291 defined and either .filename (for urlgrab) or .data (for urlread).
292 For urlgrab, .filename is the name of the local file. For
293 urlread, .data is the actual string data. If you need other
294 arguments passed to the callback (program state of some sort), you
295 can do so like this:
296
297 checkfunc=(function, ('arg1', 2), {'kwarg': 3})
298
299 if the downloaded file has filename /tmp/stuff, then this will
300 result in this call (for urlgrab):
301
302 function(obj, 'arg1', 2, kwarg=3)
303 # obj.filename = '/tmp/stuff'
304 # obj.url = 'http://foo.com/stuff'
305
306 NOTE: both the "args" tuple and "kwargs" dict must be present if
307 you use this syntax, but either (or both) can be empty.
308
309 failure_callback = None
310
311 The callback that gets called during retries when an attempt to
312 fetch a file fails. The syntax for specifying the callback is
313 identical to checkfunc, except for the attributes defined in the
314 CallbackObject instance. The attributes for failure_callback are:
315
316 exception = the raised exception
317 url = the url we're trying to fetch
318 tries = the number of tries so far (including this one)
319 retry = the value of the retry option
320
321 The callback is present primarily to inform the calling program of
322 the failure, but if it raises an exception (including the one it's
323 passed) that exception will NOT be caught and will therefore cause
324 future retries to be aborted.
325
326 The callback is called for EVERY failure, including the last one.
327 On the last try, the callback can raise an alternate exception,
328 but it cannot (without severe trickiness) prevent the exception
329 from being raised.
330
331 interrupt_callback = None
332
333 This callback is called if KeyboardInterrupt is received at any
334 point in the transfer. Basically, this callback can have three
335 impacts on the fetch process based on the way it exits:
336
337 1) raise no exception: the current fetch will be aborted, but
338 any further retries will still take place
339
340 2) raise a URLGrabError: if you're using a MirrorGroup, then
341 this will prompt a failover to the next mirror according to
342 the behavior of the MirrorGroup subclass. It is recommended
343 that you raise URLGrabError with code 15, 'user abort'. If
344 you are NOT using a MirrorGroup subclass, then this is the
345 same as (3).
346
347 3) raise some other exception (such as KeyboardInterrupt), which
348 will not be caught at either the grabber or mirror levels.
349 That is, it will be raised up all the way to the caller.
350
351 This callback is very similar to failure_callback. They are
352 passed the same arguments, so you could use the same function for
353 both.
354
355BANDWIDTH THROTTLING
356
357 urlgrabber supports throttling via two values: throttle and
358 bandwidth Between the two, you can either specify and absolute
359 throttle threshold or specify a theshold as a fraction of maximum
360 available bandwidth.
361
362 throttle is a number - if it's an int, it's the bytes/second
363 throttle limit. If it's a float, it is first multiplied by
364 bandwidth. If throttle == 0, throttling is disabled. If None, the
365 module-level default (which can be set with set_throttle) is used.
366
367 bandwidth is the nominal max bandwidth in bytes/second. If throttle
368 is a float and bandwidth == 0, throttling is disabled. If None, the
369 module-level default (which can be set with set_bandwidth) is used.
370
371 THROTTLING EXAMPLES:
372
373 Lets say you have a 100 Mbps connection. This is (about) 10^8 bits
374 per second, or 12,500,000 Bytes per second. You have a number of
375 throttling options:
376
377 *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
378
379 This will limit urlgrab to use half of your available bandwidth.
380
381 *) set_throttle(6250000) # throttle is an int
382
383 This will also limit urlgrab to use half of your available
384 bandwidth, regardless of what bandwidth is set to.
385
386 *) set_throttle(6250000); set_throttle(1.0) # float
387
388 Use half your bandwidth
389
390 *) set_throttle(6250000); set_throttle(2.0) # float
391
392 Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
393
394 *) set_throttle(6250000); set_throttle(0) # throttle = 0
395
396 Disable throttling - this is more efficient than a very large
397 throttle setting.
398
399 *) set_throttle(0); set_throttle(1.0) # throttle is float, bandwidth = 0
400
401 Disable throttling - this is the default when the module is loaded.
402
403 SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
404
405 While this is flexible, it's not extremely obvious to the user. I
406 suggest you implement a float throttle as a percent to make the
407 distinction between absolute and relative throttling very explicit.
408
409 Also, you may want to convert the units to something more convenient
410 than bytes/second, such as kbps or kB/s, etc.
411
412"""
413
414
415
416import os
417import sys
418import urlparse
419import time
420import string
421import urllib
422import urllib2
423import mimetools
424import thread
425import types
426import stat
427import pycurl
428from ftplib import parse150
429from StringIO import StringIO
430from httplib import HTTPException
431import socket
432from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
433
434########################################################################
435# MODULE INITIALIZATION
436########################################################################
437try:
438 exec('from ' + (__name__.split('.'))[0] + ' import __version__')
439except:
440 __version__ = '???'
441
442########################################################################
443# functions for debugging output. These functions are here because they
444# are also part of the module initialization.
445DEBUG = None
446def set_logger(DBOBJ):
447 """Set the DEBUG object. This is called by _init_default_logger when
448 the environment variable URLGRABBER_DEBUG is set, but can also be
449 called by a calling program. Basically, if the calling program uses
450 the logging module and would like to incorporate urlgrabber logging,
451 then it can do so this way. It's probably not necessary as most
452 internal logging is only for debugging purposes.
453
454 The passed-in object should be a logging.Logger instance. It will
455 be pushed into the keepalive and byterange modules if they're
456 being used. The mirror module pulls this object in on import, so
457 you will need to manually push into it. In fact, you may find it
458 tidier to simply push your logging object (or objects) into each
459 of these modules independently.
460 """
461
462 global DEBUG
463 DEBUG = DBOBJ
464
465def _init_default_logger(logspec=None):
466 '''Examines the environment variable URLGRABBER_DEBUG and creates
467 a logging object (logging.logger) based on the contents. It takes
468 the form
469
470 URLGRABBER_DEBUG=level,filename
471
472 where "level" can be either an integer or a log level from the
473 logging module (DEBUG, INFO, etc). If the integer is zero or
474 less, logging will be disabled. Filename is the filename where
475 logs will be sent. If it is "-", then stdout will be used. If
476 the filename is empty or missing, stderr will be used. If the
477 variable cannot be processed or the logging module cannot be
478 imported (python < 2.3) then logging will be disabled. Here are
479 some examples:
480
481 URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
482 URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
483 URLGRABBER_DEBUG=INFO # log info and higher to stderr
484
485 This funtion is called during module initialization. It is not
486 intended to be called from outside. The only reason it is a
487 function at all is to keep the module-level namespace tidy and to
488 collect the code into a nice block.'''
489
490 try:
491 if logspec is None:
492 logspec = os.environ['URLGRABBER_DEBUG']
493 dbinfo = logspec.split(',')
494 import logging
495 level = logging._levelNames.get(dbinfo[0], None)
496 if level is None: level = int(dbinfo[0])
497 if level < 1: raise ValueError()
498
499 formatter = logging.Formatter('%(asctime)s %(message)s')
500 if len(dbinfo) > 1: filename = dbinfo[1]
501 else: filename = ''
502 if filename == '': handler = logging.StreamHandler(sys.stderr)
503 elif filename == '-': handler = logging.StreamHandler(sys.stdout)
504 else: handler = logging.FileHandler(filename)
505 handler.setFormatter(formatter)
506 DBOBJ = logging.getLogger('urlgrabber')
507 DBOBJ.addHandler(handler)
508 DBOBJ.setLevel(level)
509 except (KeyError, ImportError, ValueError):
510 DBOBJ = None
511 set_logger(DBOBJ)
512
513def _log_package_state():
514 if not DEBUG: return
515 DEBUG.info('urlgrabber version = %s' % __version__)
516 DEBUG.info('trans function "_" = %s' % _)
517
518_init_default_logger()
519_log_package_state()
520
521
522# normally this would be from i18n or something like it ...
523def _(st):
524 return st
525
526########################################################################
527# END MODULE INITIALIZATION
528########################################################################
529
530
531
532class URLGrabError(IOError):
533 """
534 URLGrabError error codes:
535
536 URLGrabber error codes (0 -- 255)
537 0 - everything looks good (you should never see this)
538 1 - malformed url
539 2 - local file doesn't exist
540 3 - request for non-file local file (dir, etc)
541 4 - IOError on fetch
542 5 - OSError on fetch
543 6 - no content length header when we expected one
544 7 - HTTPException
545 8 - Exceeded read limit (for urlread)
546 9 - Requested byte range not satisfiable.
547 10 - Byte range requested, but range support unavailable
548 11 - Illegal reget mode
549 12 - Socket timeout
550 13 - malformed proxy url
551 14 - HTTPError (includes .code and .exception attributes)
552 15 - user abort
553 16 - error writing to local file
554
555 MirrorGroup error codes (256 -- 511)
556 256 - No more mirrors left to try
557
558 Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
559 [ this range reserved for application-specific error codes ]
560
561 Retry codes (< 0)
562 -1 - retry the download, unknown reason
563
564 Note: to test which group a code is in, you can simply do integer
565 division by 256: e.errno / 256
566
567 Negative codes are reserved for use by functions passed in to
568 retrygrab with checkfunc. The value -1 is built in as a generic
569 retry code and is already included in the retrycodes list.
570 Therefore, you can create a custom check function that simply
571 returns -1 and the fetch will be re-tried. For more customized
572 retries, you can use other negative number and include them in
573 retry-codes. This is nice for outputting useful messages about
574 what failed.
575
576 You can use these error codes like so:
577 try: urlgrab(url)
578 except URLGrabError, e:
579 if e.errno == 3: ...
580 # or
581 print e.strerror
582 # or simply
583 print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
584 """
585 def __init__(self, *args):
586 IOError.__init__(self, *args)
587 self.url = "No url specified"
588
589class CallbackObject:
590 """Container for returned callback data.
591
592 This is currently a dummy class into which urlgrabber can stuff
593 information for passing to callbacks. This way, the prototype for
594 all callbacks is the same, regardless of the data that will be
595 passed back. Any function that accepts a callback function as an
596 argument SHOULD document what it will define in this object.
597
598 It is possible that this class will have some greater
599 functionality in the future.
600 """
601 def __init__(self, **kwargs):
602 self.__dict__.update(kwargs)
603
604def urlgrab(url, filename=None, **kwargs):
605 """grab the file at <url> and make a local copy at <filename>
606 If filename is none, the basename of the url is used.
607 urlgrab returns the filename of the local file, which may be different
608 from the passed-in filename if the copy_local kwarg == 0.
609
610 See module documentation for a description of possible kwargs.
611 """
612 return default_grabber.urlgrab(url, filename, **kwargs)
613
614def urlopen(url, **kwargs):
615 """open the url and return a file object
616 If a progress object or throttle specifications exist, then
617 a special file object will be returned that supports them.
618 The file object can be treated like any other file object.
619
620 See module documentation for a description of possible kwargs.
621 """
622 return default_grabber.urlopen(url, **kwargs)
623
624def urlread(url, limit=None, **kwargs):
625 """read the url into a string, up to 'limit' bytes
626 If the limit is exceeded, an exception will be thrown. Note that urlread
627 is NOT intended to be used as a way of saying "I want the first N bytes"
628 but rather 'read the whole file into memory, but don't use too much'
629
630 See module documentation for a description of possible kwargs.
631 """
632 return default_grabber.urlread(url, limit, **kwargs)
633
634
635class URLParser:
636 """Process the URLs before passing them to urllib2.
637
638 This class does several things:
639
640 * add any prefix
641 * translate a "raw" file to a proper file: url
642 * handle any http or https auth that's encoded within the url
643 * quote the url
644
645 Only the "parse" method is called directly, and it calls sub-methods.
646
647 An instance of this class is held in the options object, which
648 means that it's easy to change the behavior by sub-classing and
649 passing the replacement in. It need only have a method like:
650
651 url, parts = urlparser.parse(url, opts)
652 """
653
654 def parse(self, url, opts):
655 """parse the url and return the (modified) url and its parts
656
657 Note: a raw file WILL be quoted when it's converted to a URL.
658 However, other urls (ones which come with a proper scheme) may
659 or may not be quoted according to opts.quote
660
661 opts.quote = 1 --> quote it
662 opts.quote = 0 --> do not quote it
663 opts.quote = None --> guess
664 """
665 quote = opts.quote
666
667 if opts.prefix:
668 url = self.add_prefix(url, opts.prefix)
669
670 parts = urlparse.urlparse(url)
671 (scheme, host, path, parm, query, frag) = parts
672
673 if not scheme or (len(scheme) == 1 and scheme in string.letters):
674 # if a scheme isn't specified, we guess that it's "file:"
675 if url[0] not in '/\\': url = os.path.abspath(url)
676 url = 'file:' + urllib.pathname2url(url)
677 parts = urlparse.urlparse(url)
678 quote = 0 # pathname2url quotes, so we won't do it again
679
680 if scheme in ['http', 'https']:
681 parts = self.process_http(parts, url)
682
683 if quote is None:
684 quote = self.guess_should_quote(parts)
685 if quote:
686 parts = self.quote(parts)
687
688 url = urlparse.urlunparse(parts)
689 return url, parts
690
691 def add_prefix(self, url, prefix):
692 if prefix[-1] == '/' or url[0] == '/':
693 url = prefix + url
694 else:
695 url = prefix + '/' + url
696 return url
697
698 def process_http(self, parts, url):
699 (scheme, host, path, parm, query, frag) = parts
700 # TODO: auth-parsing here, maybe? pycurl doesn't really need it
701 return (scheme, host, path, parm, query, frag)
702
703 def quote(self, parts):
704 """quote the URL
705
706 This method quotes ONLY the path part. If you need to quote
707 other parts, you should override this and pass in your derived
708 class. The other alternative is to quote other parts before
709 passing into urlgrabber.
710 """
711 (scheme, host, path, parm, query, frag) = parts
712 path = urllib.quote(path)
713 return (scheme, host, path, parm, query, frag)
714
715 hexvals = '0123456789ABCDEF'
716 def guess_should_quote(self, parts):
717 """
718 Guess whether we should quote a path. This amounts to
719 guessing whether it's already quoted.
720
721 find ' ' -> 1
722 find '%' -> 1
723 find '%XX' -> 0
724 else -> 1
725 """
726 (scheme, host, path, parm, query, frag) = parts
727 if ' ' in path:
728 return 1
729 ind = string.find(path, '%')
730 if ind > -1:
731 while ind > -1:
732 if len(path) < ind+3:
733 return 1
734 code = path[ind+1:ind+3].upper()
735 if code[0] not in self.hexvals or \
736 code[1] not in self.hexvals:
737 return 1
738 ind = string.find(path, '%', ind+1)
739 return 0
740 return 1
741
class URLGrabberOptions:
    """Option container with delegate-based inheritance.

    An instance created without a delegate holds a complete set of
    default options.  An instance created with a delegate stores only
    the overrides it was given; every other option is looked up on the
    delegate through __getattr__.
    """

    def __init__(self, delegate=None, **kwargs):
        """Build the options object.

        When no delegate is supplied all options are first set to their
        defaults.  In either case the options named in kwargs are then
        applied on top.
        """
        self.delegate = delegate
        if delegate is None:
            self._set_defaults()
        self._set_attributes(**kwargs)

    def __getattr__(self, name):
        # Only invoked when normal lookup fails: defer to the delegate.
        parent = self.delegate
        if parent and hasattr(parent, name):
            return getattr(parent, name)
        raise AttributeError(name)

    def raw_throttle(self):
        """Return the effective throttle derived from the throttle and
        bandwidth options.

        A non-positive throttle yields 0, an int throttle is returned
        as a float, and anything else is treated as a fraction of
        bandwidth.
        """
        limit = self.throttle
        if limit <= 0:
            return 0
        if type(limit) == type(0):
            return float(limit)
        # throttle is a float: scale the configured bandwidth
        return self.bandwidth * limit

    def derive(self, **kwargs):
        """Return a new URLGrabberOptions that delegates to this one
        and overrides only the options given in kwargs.
        """
        return URLGrabberOptions(delegate=self, **kwargs)

    def _set_attributes(self, **kwargs):
        """Store kwargs as attributes, then validate range and reget."""
        self.__dict__.update(kwargs)
        if 'range' in kwargs:
            # normalize the supplied range value
            self.range = range_tuple_normalize(self.range)
        if self.reget not in [None, 'simple', 'check_timestamp']:
            raise URLGrabError(11,
                               _('Illegal reget mode: %s') % (self.reget, ))

    def _set_defaults(self):
        """Assign the default value of every option.

        New options must be given a default here.
        """
        defaults = [
            ('progress_obj', None),
            ('throttle', 1.0),
            ('bandwidth', 0),
            ('retry', None),
            ('retrycodes', [-1,2,4,5,6,7]),
            ('checkfunc', None),
            ('copy_local', 0),
            ('close_connection', 0),
            ('range', None),
            ('user_agent', 'urlgrabber/%s' % __version__),
            ('keepalive', 1),
            ('proxies', None),
            ('reget', None),
            ('failure_callback', None),
            ('interrupt_callback', None),
            ('prefix', None),
            ('opener', None),
            ('cache_openers', True),
            ('timeout', None),
            ('text', None),
            ('http_headers', None),
            ('ftp_headers', None),
            ('data', None),
            ('urlparser', URLParser()),
            ('quote', None),
            ('ssl_ca_cert', None),       # sets SSL_CAINFO - path to certdb
            ('ssl_context', None),       # no-op in pycurl
            ('ssl_verify_peer', True),   # check peer's cert for authenticity
            ('ssl_verify_host', True),   # make sure who they are and who
                                         # the cert is for matches
            ('ssl_key', None),           # client key
            ('ssl_key_type', 'PEM'),     # (or DER)
            ('ssl_cert', None),          # client cert
            ('ssl_cert_type', 'PEM'),    # (or DER)
            ('ssl_key_pass', None),      # password to access the key
            ('size', None),              # expected size of the download;
                                         # ultimately a MAXIMUM size
            ('max_header_size', 2097152),  # 2mb seems reasonable for a
                                           # maximum header size
        ]
        for option, value in defaults:
            setattr(self, option, value)

    def __repr__(self):
        return self.format()

    def format(self, indent=' '):
        """Return a brace-delimited, sorted listing of the locally
        stored options, followed by the delegate's listing (indented
        further) when there is a delegate.
        """
        hide_delegate = self.delegate is not None
        names = sorted(key for key in self.__dict__
                       if not (hide_delegate and key == 'delegate'))
        pieces = ['{\n']
        for name in names:
            entry = '%-15s: %s,\n' % (repr(name), repr(self.__dict__[name]))
            pieces.append(indent + entry)
        if self.delegate:
            nested = self.delegate.format(indent + ' ')
            pieces.append(indent + '%-15s: %s\n' % ("'delegate'", nested))
        pieces.append(indent + '}')
        return ''.join(pieces)
848
class URLGrabber:
    """Provides easy opening of URLs with a variety of options.

    All options are specified as kwargs. Options may be specified when
    the class is created and may be overridden on a per request basis.

    New objects inherit default values from default_grabber.
    """

    def __init__(self, **kwargs):
        self.opts = URLGrabberOptions(**kwargs)

    def _retry(self, opts, func, *args):
        """Call func(opts, *args), retrying as configured in opts.

        On URLGrabError the failure_callback is invoked, on
        KeyboardInterrupt the interrupt_callback.  The error is
        re-raised when retrying is disabled, the retry count is
        exhausted, or the error code is not listed in opts.retrycodes.
        args[0] is expected to be the URL (it is used for logging and
        for the callback object).
        """
        tries = 0
        while 1:
            # there are only two ways out of this loop.  The second has
            # several "sub-ways"
            #   1) via the return in the "try" block
            #   2) by some exception being raised
            #      a) an exception is raised that we don't "except"
            #      b) a callback raises ANY exception
            #      c) we're not retry-ing or have run out of retries
            #      d) the URLGrabError code is not in retrycodes
            # beware of infinite loops :)
            tries = tries + 1
            exception = None
            retrycode = None
            callback = None
            if DEBUG: DEBUG.info('attempt %i/%s: %s',
                                 tries, opts.retry, args[0])
            try:
                r = func(opts, *args)
                if DEBUG: DEBUG.info('success')
                return r
            except URLGrabError as e:
                exception = e
                callback = opts.failure_callback
                retrycode = e.errno
            except KeyboardInterrupt as e:
                exception = e
                callback = opts.interrupt_callback

            if DEBUG: DEBUG.info('exception: %s', exception)
            if callback:
                if DEBUG: DEBUG.info('calling callback: %s', callback)
                cb_func, cb_args, cb_kwargs = self._make_callback(callback)
                obj = CallbackObject(exception=exception, url=args[0],
                                     tries=tries, retry=opts.retry)
                cb_func(obj, *cb_args, **cb_kwargs)

            if (opts.retry is None) or (tries == opts.retry):
                if DEBUG: DEBUG.info('retries exceeded, re-raising')
                raise

            if (retrycode is not None) and (retrycode not in opts.retrycodes):
                if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                     retrycode, opts.retrycodes)
                raise

    def urlopen(self, url, **kwargs):
        """open the url and return a file object
        If a progress object or throttle value specified when this
        object was created, then a special file object will be
        returned that supports them. The file object can be treated
        like any other file object.
        """
        opts = self.opts.derive(**kwargs)
        if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
        (url,parts) = opts.urlparser.parse(url, opts)
        def retryfunc(opts, url):
            return PyCurlFileObject(url, filename=None, opts=opts)
        return self._retry(opts, retryfunc, url)

    def urlgrab(self, url, filename=None, **kwargs):
        """grab the file at <url> and make a local copy at <filename>
        If filename is none, the basename of the url is used.
        urlgrab returns the filename of the local file, which may be
        different from the passed-in filename if copy_local == 0.
        """
        opts = self.opts.derive(**kwargs)
        if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
        (url,parts) = opts.urlparser.parse(url, opts)
        (scheme, host, path, parm, query, frag) = parts
        if filename is None:
            filename = os.path.basename( urllib.unquote(path) )
        if scheme == 'file' and not opts.copy_local:
            # just return the name of the local file - don't make a
            # copy currently
            path = urllib.url2pathname(path)
            if host:
                path = os.path.normpath('//' + host + path)
            if not os.path.exists(path):
                err = URLGrabError(2,
                      _('Local file does not exist: %s') % (path, ))
                err.url = url
                raise err
            elif not os.path.isfile(path):
                err = URLGrabError(3,
                      _('Not a normal file: %s') % (path, ))
                err.url = url
                raise err

            elif not opts.range:
                # whole local file requested: validate it and hand back
                # its path without copying
                if opts.checkfunc is not None:
                    cb_func, cb_args, cb_kwargs = \
                        self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.filename = path
                    obj.url = url
                    cb_func(obj, *cb_args, **cb_kwargs)
                return path

        def retryfunc(opts, url, filename):
            fo = PyCurlFileObject(url, filename, opts)
            try:
                fo._do_grab()
                if opts.checkfunc is not None:
                    cb_func, cb_args, cb_kwargs = \
                        self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.filename = filename
                    obj.url = url
                    cb_func(obj, *cb_args, **cb_kwargs)
            finally:
                fo.close()
            return filename

        return self._retry(opts, retryfunc, url, filename)

    def urlread(self, url, limit=None, **kwargs):
        """read the url into a string, up to 'limit' bytes
        If the limit is exceeded, an exception will be thrown.  Note
        that urlread is NOT intended to be used as a way of saying
        "I want the first N bytes" but rather 'read the whole file
        into memory, but don't use too much'

        Raises URLGrabError with code 8 when the content is longer than
        'limit' bytes.
        """
        opts = self.opts.derive(**kwargs)
        if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
        (url,parts) = opts.urlparser.parse(url, opts)

        # Ask for one byte more than the caller allows, so that content
        # which fits exactly can be told apart from content that is too
        # long.  The caller's own 'limit' is kept for the error message.
        read_limit = None
        if limit is not None:
            read_limit = limit + 1

        def retryfunc(opts, url, limit):
            fo = PyCurlFileObject(url, filename=None, opts=opts)
            s = ''
            try:
                # this is an unfortunate thing.  Some file-like objects
                # have a default "limit" of None, while the built-in (real)
                # file objects have -1.  They each break the other, so for
                # now, we just force the default if necessary.
                if limit is None: s = fo.read()
                else: s = fo.read(limit)

                if opts.checkfunc is not None:
                    cb_func, cb_args, cb_kwargs = \
                        self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.data = s
                    obj.url = url
                    cb_func(obj, *cb_args, **cb_kwargs)
            finally:
                fo.close()
            return s

        s = self._retry(opts, retryfunc, url, read_limit)
        # Receiving the extra byte means the content is over the limit.
        # (The previous test compared against the already-incremented
        # value and therefore could never trigger.)
        if read_limit and len(s) >= read_limit:
            err = URLGrabError(8,
                               _('Exceeded limit (%i): %s') % (limit, url))
            err.url = url
            raise err

        return s

    def _make_callback(self, callback_obj):
        """Normalise a callback spec to (func, args, kwargs).

        A bare callable gets empty args/kwargs; anything else is
        returned unchanged and is expected to already be such a
        3-tuple.
        """
        if callable(callback_obj):
            return callback_obj, (), {}
        else:
            return callback_obj
1027
1028# create the default URLGrabber used by urlXXX functions.
1029# NOTE: actual defaults are set in URLGrabberOptions
1030default_grabber = URLGrabber()
1031
1032
class PyCurlFileObject():
    """File-like object that fetches a URL with pycurl.

    The transfer is performed eagerly from the constructor (via
    _do_open -> _do_grab).  Data is written either to 'filename' or,
    when no filename is given, to an in-memory StringIO; afterwards
    self.fo is positioned for reading and attribute access that this
    class does not satisfy is forwarded to it.
    """

    def __init__(self, url, filename, opts):
        self.fo = None
        self._hdr_dump = ''
        self._parsed_hdr = None
        self.url = url
        self.scheme = urlparse.urlsplit(self.url)[0]
        self.filename = filename
        self.append = False
        self.reget_time = None
        self.opts = opts
        if self.opts.reget == 'check_timestamp':
            raise NotImplementedError("check_timestamp regets are not implemented in this ver of urlgrabber. Please report this.")
        self._complete = False
        self._rbuf = ''
        self._rbufsize = 1024*8
        self._ttime = time.time()
        self._tsize = 0
        self._amount_read = 0
        self._reget_length = 0
        self._prog_running = False
        self._error = (None, None)
        self.size = None
        self._do_open()

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.
        Any attribute not found in _this_ object will be searched for
        in self.fo.  This includes methods."""

        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError(name)

    def _retrieve(self, buf):
        """pycurl WRITEFUNCTION: store a chunk of body data.

        Starts the progress object on the first chunk.  Returns the
        number of bytes consumed, or -1 to abort on KeyboardInterrupt.
        """
        try:
            if not self._prog_running:
                if self.opts.progress_obj:
                    # self.size stays None when the server announced no
                    # length; adding to None would raise inside the
                    # callback and abort the transfer.
                    if self.size is None:
                        size = None
                    else:
                        size = self.size + self._reget_length
                    self.opts.progress_obj.start(self._prog_reportname,
                                                 urllib.unquote(self.url),
                                                 self._prog_basename,
                                                 size=size,
                                                 text=self.opts.text)
                    self._prog_running = True
                    self.opts.progress_obj.update(self._amount_read)

            self._amount_read += len(buf)
            self.fo.write(buf)
            return len(buf)
        except KeyboardInterrupt:
            return -1

    def _hdr_retrieve(self, buf):
        """pycurl HEADERFUNCTION: collect one header line.

        Also records the announced download size in self.size.  Returns
        len(buf) on success; any other value makes curl abort.
        """
        if self._over_max_size(cur=len(self._hdr_dump),
                               max_size=self.opts.max_header_size):
            return -1
        try:
            self._hdr_dump += buf
            # we have to get the size before we do the progress obj start
            # but we can't do that w/o making it do 2 connects, which sucks
            # so we cheat and stuff it in here in the hdr_retrieve
            if self.scheme in ('http', 'https'):
                # Only the Content-Length header itself counts, not other
                # headers that merely mention it in their value.
                if buf.lower().startswith('content-length:'):
                    length = buf.split(':', 1)[1]
                    self.size = int(length)
            elif self.scheme == 'ftp':
                s = None
                if buf.startswith('213 '):
                    s = buf[3:].strip()
                elif buf.startswith('150 '):
                    s = parse150(buf)
                if s:
                    self.size = int(s)

            return len(buf)
        except KeyboardInterrupt:
            return pycurl.READFUNC_ABORT

    def _return_hdr_obj(self):
        """Return the response headers as a mimetools.Message (cached)."""
        if self._parsed_hdr:
            return self._parsed_hdr
        # Skip the status line *including* its newline; otherwise the
        # parser sees a blank first line and stops immediately.
        statusend = self._hdr_dump.find('\n') + 1
        hdrfp = StringIO()
        hdrfp.write(self._hdr_dump[statusend:])
        # rewind so the parser reads what was just written
        hdrfp.seek(0)
        self._parsed_hdr = mimetools.Message(hdrfp)
        return self._parsed_hdr

    hdr = property(_return_hdr_obj)
    http_code = property(fget=
                 lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))

    def _set_opts(self, opts=None):
        """Configure self.curl_obj from opts (self.opts when omitted)."""
        # XXX
        if not opts:
            opts = self.opts

        # defaults we're always going to set
        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
        self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
        self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
        self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
        self.curl_obj.setopt(pycurl.FAILONERROR, True)
        self.curl_obj.setopt(pycurl.OPT_FILETIME, True)

        if DEBUG:
            self.curl_obj.setopt(pycurl.VERBOSE, True)
        if opts.user_agent:
            self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)

        # maybe to be options later
        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
        self.curl_obj.setopt(pycurl.MAXREDIRS, 5)

        # timeouts (300 seconds is also libcurl's own connect default)
        timeout = 300
        if opts.timeout:
            timeout = int(opts.timeout)
        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)

        # ssl options
        if self.scheme == 'https':
            if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
                self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
            self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
            # libcurl needs 2 to check the certificate against the host
            # name; passing True (1) does not enable that check.
            if opts.ssl_verify_host:
                self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
            else:
                self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 0)
            if opts.ssl_key:
                self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
            if opts.ssl_key_type:
                self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
            if opts.ssl_cert:
                self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
            if opts.ssl_cert_type:
                self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
            if opts.ssl_key_pass:
                self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)

        #headers:
        if opts.http_headers and self.scheme in ('http', 'https'):
            headers = ['%s:%s' % (tag, content)
                       for (tag, content) in opts.http_headers]
            self.curl_obj.setopt(pycurl.HTTPHEADER, headers)

        # ranges:
        if opts.range or opts.reget:
            range_str = self._build_range()
            if range_str:
                self.curl_obj.setopt(pycurl.RANGE, range_str)

        # throttle/bandwidth
        if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
            self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))

        # proxy settings
        if opts.proxies:
            # only set the ftp proxy for ftp items, and the http/https
            # proxies for http/https items
            if self.scheme == 'ftp':
                wanted = ('ftp',)
            elif self.scheme in ('http', 'https'):
                wanted = ('http', 'https')
            else:
                wanted = ()
            for (scheme, proxy) in opts.proxies.items():
                if scheme not in wanted:
                    continue
                if proxy == '_none_': proxy = ""
                self.curl_obj.setopt(pycurl.PROXY, proxy)

        # FIXME username/password/auth settings

        #posts - simple - expects the fields as they are
        if opts.data:
            self.curl_obj.setopt(pycurl.POST, True)
            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))

        # our url
        self.curl_obj.setopt(pycurl.URL, self.url)

    def _do_perform(self):
        """Run the transfer and translate pycurl errors.

        Raises URLGrabError for most curl failures and
        KeyboardInterrupt when the transfer appears to have been
        aborted from a callback.
        """
        if self._complete:
            return

        try:
            self.curl_obj.perform()
        except pycurl.error as e:
            # XXX - break some of these out a bit more clearly
            # to other URLGrabErrors from
            # http://curl.haxx.se/libcurl/c/libcurl-errors.html
            # this covers e.args[0] == 22 pretty well - which will be common

            code = self.http_code
            errcode = e.args[0]
            if self._error[0]:
                errcode = self._error[0]

            if errcode == 23 and code >= 200 and code < 299:
                # this is probably wrong but ultimately this is what happens
                # we have a legit http code and a pycurl 'writer failed' code
                # which almost always means something aborted it from outside
                # since we cannot know what it is -I'm banking on it being
                # a ctrl-c. XXXX - if there's a way of going back two raises to
                # figure out what aborted the pycurl process FIXME
                raise KeyboardInterrupt

            elif errcode == 28:
                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
                err.url = self.url
                raise err
            elif errcode == 35:
                msg = _("problem making ssl connection")
                err = URLGrabError(14, msg)
                err.url = self.url
                raise err
            elif errcode == 37:
                msg = _("Could not open/read %s") % (self.url)
                err = URLGrabError(14, msg)
                err.url = self.url
                raise err

            elif errcode == 42:
                # aborted by a callback; treated as a ctrl-c for the same
                # reason as the errcode 23 case above
                raise KeyboardInterrupt

            elif errcode == 58:
                msg = _("problem with the local client certificate")
                err = URLGrabError(14, msg)
                err.url = self.url
                raise err

            elif errcode == 60:
                msg = _("client cert cannot be verified or client cert incorrect")
                err = URLGrabError(14, msg)
                err.url = self.url
                raise err

            elif errcode == 63:
                if self._error[1]:
                    msg = self._error[1]
                else:
                    msg = _("Max download size exceeded on %s") % (self.url)
                err = URLGrabError(14, msg)
                err.url = self.url
                raise err

            elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
            else:
                msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
                code = errcode
            err = URLGrabError(14, msg)
            err.code = code
            err.exception = e
            raise err

    def _do_open(self):
        """Reset the shared curl handle, configure it and fetch."""
        self.curl_obj = _curl_cache
        self.curl_obj.reset() # reset all old settings away, just in case
        # setup any ranges
        self._set_opts()
        self._do_grab()
        return self.fo

    def _add_headers(self):
        pass

    def _build_range(self):
        """Return the value for curl's RANGE option, or None.

        With reget enabled and an existing local file, the range starts
        at the file's current size and the object is switched to append
        mode.
        """
        reget_length = 0
        rt = None
        if self.opts.reget and type(self.filename) in types.StringTypes:
            # we have reget turned on and we're dumping to a file
            try:
                s = os.stat(self.filename)
            except OSError:
                pass
            else:
                self.reget_time = s[stat.ST_MTIME]
                reget_length = s[stat.ST_SIZE]

                # Set initial length when regetting
                self._amount_read = reget_length
                self._reget_length = reget_length # set where we started from, too

                rt = reget_length, ''
                self.append = 1

        if self.opts.range:
            rt = self.opts.range
            if rt[0]: rt = (rt[0] + reget_length, rt[1])

        if rt:
            header = range_tuple_to_header(rt)
            if header:
                return header.split('=')[1]

    def _make_request(self, req, opener):
        """Return (file object, headers) for the completed transfer.

        'req' and 'opener' are unused; the earlier urllib2-based request
        code that followed the return was unreachable and has been
        removed.
        """
        #XXXX
        # This doesn't do anything really, but we could use this
        # instead of do_open() to catch a lot of crap errors as
        # mstenner did before here
        return (self.fo, self.hdr)

    def _do_grab(self):
        """dump the file to a filename or StringIO buffer"""

        if self._complete:
            return
        _was_filename = False
        if type(self.filename) in types.StringTypes and self.filename:
            _was_filename = True
            self._prog_reportname = str(self.filename)
            self._prog_basename = os.path.basename(self.filename)

            if self.append: mode = 'ab'
            else: mode = 'wb'

            if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
                                 (self.filename, mode))
            try:
                self.fo = open(self.filename, mode)
            except IOError as e:
                err = URLGrabError(16, _(\
                  'error opening local file from %s, IOError: %s') % (self.url, e))
                err.url = self.url
                raise err

        else:
            self._prog_reportname = 'MEMORY'
            self._prog_basename = 'MEMORY'

            self.fo = StringIO()
            # if this is to be a tempfile instead....
            # it just makes crap in the tempdir
            #fh, self._temp_name = mkstemp()
            #self.fo = open(self._temp_name, 'wb')

        self._do_perform()

        if _was_filename:
            # close it up
            self.fo.flush()
            self.fo.close()
            # set the time
            mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
            if mod_time != -1:
                os.utime(self.filename, (mod_time, mod_time))
            # re open it
            self.fo = open(self.filename, 'r')
        else:
            #self.fo = open(self._temp_name, 'r')
            self.fo.seek(0)

        self._complete = True

    def _fill_buffer(self, amt=None):
        """fill the buffer to contain at least 'amt' bytes by reading
        from the underlying file object.  If amt is None, then it will
        read until it gets nothing more.  It updates the progress meter
        and throttles after every self._rbufsize bytes."""
        # the _rbuf test is only in this first 'if' for speed.  It's not
        # logically necessary
        if self._rbuf and amt is not None:
            L = len(self._rbuf)
            if amt > L:
                amt = amt - L
            else:
                return

        # if we've made it here, then we don't have enough in the buffer
        # and we need to read more.

        if not self._complete: self._do_grab() #XXX cheater - change on ranges

        buf = [self._rbuf]
        while amt is None or amt:
            # first, delay if necessary for throttling reasons
            if self.opts.raw_throttle():
                diff = self._tsize/self.opts.raw_throttle() - \
                       (time.time() - self._ttime)
                if diff > 0: time.sleep(diff)
                self._ttime = time.time()

            # now read some data, up to self._rbufsize
            if amt is None: readamount = self._rbufsize
            else: readamount = min(amt, self._rbufsize)
            try:
                new = self.fo.read(readamount)
            # socket.timeout is a subclass of socket.error, so it has to
            # be handled first to be reported as a timeout.
            except socket.timeout as e:
                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
                err.url = self.url
                raise err

            except socket.error as e:
                err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
                err.url = self.url
                raise err

            except IOError as e:
                err = URLGrabError(4, _('IOError on %s: %s') %(self.url, e))
                err.url = self.url
                raise err

            newsize = len(new)
            if not newsize: break # no more to read

            if amt: amt = amt - newsize
            buf.append(new)
            self._tsize = newsize
            self._amount_read = self._amount_read + newsize
            #if self.opts.progress_obj:
            #    self.opts.progress_obj.update(self._amount_read)

        self._rbuf = ''.join(buf)
        return

    def _progress_update(self, download_total, downloaded, upload_total, uploaded):
        """pycurl PROGRESSFUNCTION: enforce the size cap and feed the
        progress object.  Returns -1 to abort the transfer."""
        if self._over_max_size(cur=self._amount_read-self._reget_length):
            return -1

        try:
            if self._prog_running:
                downloaded += self._reget_length
                self.opts.progress_obj.update(downloaded)
        except KeyboardInterrupt:
            return -1

    def _over_max_size(self, cur, max_size=None):
        """Return True (and record an error) when 'cur' exceeds the
        applicable maximum by more than 10%.

        An explicit max_size always wins, so the header-size cap is not
        replaced by the expected file size.  Otherwise opts.size is
        used when set, falling back to the size announced by the
        server.
        """
        if not max_size:
            if self.opts.size: # if we set an opts size use that
                max_size = self.opts.size
            else:
                max_size = self.size
        if not max_size: return False # if we have None for all of the Max then this is dumb
        if cur > max_size + max_size*.10:

            msg = _("Downloaded more than max size for %s: %s > %s") \
                        % (self.url, cur, max_size)
            self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
            return True
        return False

    def _to_utf8(self, obj, errors='replace'):
        '''convert 'unicode' to an encoded utf-8 byte string '''
        # stolen from yum.i18n
        if isinstance(obj, unicode):
            obj = obj.encode('utf-8', errors)
        return obj

    def read(self, amt=None):
        """Return up to 'amt' bytes (everything when amt is None)."""
        self._fill_buffer(amt)
        if amt is None:
            s, self._rbuf = self._rbuf, ''
        else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
        return s

    def readline(self, limit=-1):
        """Return the next line from the downloaded data.

        'limit' is passed on to the underlying file object; the default
        of -1 means no limit.  Lines are read straight from self.fo,
        bypassing the buffer used by read().
        """
        if not self._complete: self._do_grab()
        return self.fo.readline(limit)

    def close(self):
        """Finish the progress display (if started) and close self.fo."""
        if self._prog_running:
            self.opts.progress_obj.end(self._amount_read)
        self.fo.close()
1585
1586_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
1587
1588
1589#####################################################################
1590# DEPRECATED FUNCTIONS
def set_throttle(new_throttle):
    """Deprecated.  Use: default_grabber.opts.throttle = new_throttle

    Sets the throttle option used by default_grabber.
    """
    # Options are read from default_grabber.opts; an attribute on the
    # grabber itself is never consulted, so it has to be set here.
    default_grabber.opts.throttle = new_throttle
    # Kept so code reading default_grabber.throttle back still works.
    default_grabber.throttle = new_throttle
1594
def set_bandwidth(new_bandwidth):
    """Deprecated.  Use: default_grabber.opts.bandwidth = new_bandwidth

    Sets the bandwidth option used by default_grabber.
    """
    # Options are read from default_grabber.opts; an attribute on the
    # grabber itself is never consulted, so it has to be set here.
    default_grabber.opts.bandwidth = new_bandwidth
    # Kept so code reading default_grabber.bandwidth back still works.
    default_grabber.bandwidth = new_bandwidth
1598
def set_progress_obj(new_progress_obj):
    """Deprecated.  Use: default_grabber.opts.progress_obj = new_progress_obj

    Sets the progress object used by default_grabber.
    """
    # Options are read from default_grabber.opts; an attribute on the
    # grabber itself is never consulted, so it has to be set here.
    default_grabber.opts.progress_obj = new_progress_obj
    # Kept so code reading default_grabber.progress_obj back still works.
    default_grabber.progress_obj = new_progress_obj
1602
def set_user_agent(new_user_agent):
    """Deprecated.  Use: default_grabber.opts.user_agent = new_user_agent

    Sets the user agent string used by default_grabber.
    """
    # Options are read from default_grabber.opts; an attribute on the
    # grabber itself is never consulted, so it has to be set here.
    default_grabber.opts.user_agent = new_user_agent
    # Kept so code reading default_grabber.user_agent back still works.
    default_grabber.user_agent = new_user_agent
1606
def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """Deprecated. Use: urlgrab() with the retry arg instead

    Forwards every argument to urlgrab(); 'numtries' is passed on as
    the 'retry' option.
    """
    options = dict(
        copy_local=copy_local,
        close_connection=close_connection,
        progress_obj=progress_obj,
        throttle=throttle,
        bandwidth=bandwidth,
        retry=numtries,
        retrycodes=retrycodes,
        checkfunc=checkfunc,
    )
    return urlgrab(url, filename, **options)
1621
1622
1623#####################################################################
1624# TESTING
1625def _main_test():
1626 try: url, filename = sys.argv[1:3]
1627 except ValueError:
1628 print 'usage:', sys.argv[0], \
1629 '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1630 sys.exit()
1631
1632 kwargs = {}
1633 for a in sys.argv[3:]:
1634 k, v = string.split(a, '=', 1)
1635 kwargs[k] = int(v)
1636
1637 set_throttle(1.0)
1638 set_bandwidth(32 * 1024)
1639 print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
1640 default_grabber.bandwidth)
1641
1642 try: from progress import text_progress_meter
1643 except ImportError, e: pass
1644 else: kwargs['progress_obj'] = text_progress_meter()
1645
1646 try: name = apply(urlgrab, (url, filename), kwargs)
1647 except URLGrabError, e: print e
1648 else: print 'LOCAL FILE:', name
1649
1650
1651def _retry_test():
1652 try: url, filename = sys.argv[1:3]
1653 except ValueError:
1654 print 'usage:', sys.argv[0], \
1655 '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1656 sys.exit()
1657
1658 kwargs = {}
1659 for a in sys.argv[3:]:
1660 k, v = string.split(a, '=', 1)
1661 kwargs[k] = int(v)
1662
1663 try: from progress import text_progress_meter
1664 except ImportError, e: pass
1665 else: kwargs['progress_obj'] = text_progress_meter()
1666
1667 def cfunc(filename, hello, there='foo'):
1668 print hello, there
1669 import random
1670 rnum = random.random()
1671 if rnum < .5:
1672 print 'forcing retry'
1673 raise URLGrabError(-1, 'forcing retry')
1674 if rnum < .75:
1675 print 'forcing failure'
1676 raise URLGrabError(-2, 'forcing immediate failure')
1677 print 'success'
1678 return
1679
1680 kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
1681 try: name = apply(retrygrab, (url, filename), kwargs)
1682 except URLGrabError, e: print e
1683 else: print 'LOCAL FILE:', name
1684
1685def _file_object_test(filename=None):
1686 import cStringIO
1687 if filename is None:
1688 filename = __file__
1689 print 'using file "%s" for comparisons' % filename
1690 fo = open(filename)
1691 s_input = fo.read()
1692 fo.close()
1693
1694 for testfunc in [_test_file_object_smallread,
1695 _test_file_object_readall,
1696 _test_file_object_readline,
1697 _test_file_object_readlines]:
1698 fo_input = cStringIO.StringIO(s_input)
1699 fo_output = cStringIO.StringIO()
1700 wrapper = PyCurlFileObject(fo_input, None, 0)
1701 print 'testing %-30s ' % testfunc.__name__,
1702 testfunc(wrapper, fo_output)
1703 s_output = fo_output.getvalue()
1704 if s_output == s_input: print 'passed'
1705 else: print 'FAILED'
1706
1707def _test_file_object_smallread(wrapper, fo_output):
1708 while 1:
1709 s = wrapper.read(23)
1710 fo_output.write(s)
1711 if not s: return
1712
1713def _test_file_object_readall(wrapper, fo_output):
1714 s = wrapper.read()
1715 fo_output.write(s)
1716
1717def _test_file_object_readline(wrapper, fo_output):
1718 while 1:
1719 s = wrapper.readline()
1720 fo_output.write(s)
1721 if not s: return
1722
1723def _test_file_object_readlines(wrapper, fo_output):
1724 li = wrapper.readlines()
1725 fo_output.write(string.join(li, ''))
1726
1727if __name__ == '__main__':
1728 _main_test()
1729 _retry_test()
1730 _file_object_test('test')
17310
=== removed directory '.pc/progress_fix.diff'
=== removed directory '.pc/progress_fix.diff/urlgrabber'
=== removed file '.pc/progress_fix.diff/urlgrabber/progress.py'
--- .pc/progress_fix.diff/urlgrabber/progress.py 2010-07-08 17:40:08 +0000
+++ .pc/progress_fix.diff/urlgrabber/progress.py 1970-01-01 00:00:00 +0000
@@ -1,755 +0,0 @@
1# This library is free software; you can redistribute it and/or
2# modify it under the terms of the GNU Lesser General Public
3# License as published by the Free Software Foundation; either
4# version 2.1 of the License, or (at your option) any later version.
5#
6# This library is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9# Lesser General Public License for more details.
10#
11# You should have received a copy of the GNU Lesser General Public
12# License along with this library; if not, write to the
13# Free Software Foundation, Inc.,
14# 59 Temple Place, Suite 330,
15# Boston, MA 02111-1307 USA
16
17# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19
20
21import sys
22import time
23import math
24import thread
25import fcntl
26import struct
27import termios
28
29# Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html
def terminal_width(fd=1):
    """ Get the real terminal width.

    Queries the window size of the terminal on file descriptor *fd* with the
    TIOCGWINSZ ioctl and returns the column count. Returns 80 when the query
    fails (for example when *fd* is not a terminal) or reports 0 columns.
    """
    try:
        # TIOCGWINSZ fills a struct of four shorts; index 1 is the columns.
        buf = struct.pack('hhhh', 0, 0, 0, 0)
        buf = fcntl.ioctl(fd, termios.TIOCGWINSZ, buf)
        ret = struct.unpack('hhhh', buf)[1]
    except Exception:
        # Best effort: any failure means "unknown", but unlike a bare
        # except this lets KeyboardInterrupt and SystemExit propagate.
        return 80
    if ret == 0:
        return 80
    # Add minimum too?
    return ret
42
# Last width returned by terminal_width() and the time it was sampled.
_term_width_val = None
_term_width_last = None
def terminal_width_cached(fd=1, cache_timeout=1.000):
    """ Get the real terminal width, but cache it for a bit.

    The width is re-read through terminal_width(fd) when nothing is cached
    yet or the cached value is older than *cache_timeout* seconds.
    """
    global _term_width_val, _term_width_last

    current = time.time()
    stale = (_term_width_val is None or
             (current - _term_width_last) > cache_timeout)
    if stale:
        _term_width_val = terminal_width(fd)
        _term_width_last = current
    return _term_width_val
55
class TerminalLine:
    """ Help create dynamic progress bars, uses terminal_width_cached().

    An instance tracks how many columns of the line are still unused.
    Elements are claimed with add(); whatever is left over is reported by
    rest() and is used by callers as the width of the leading text. """

    def __init__(self, min_rest=0, beg_len=None, fd=1, cache_timeout=1.000):
        """ min_rest is the number of columns that add() never hands out;
            beg_len is the minimum line length to assume (defaults to
            min_rest) when the terminal reports less. """
        if beg_len is None:
            beg_len = min_rest
        self._min_len = min_rest
        self._llen = terminal_width_cached(fd, cache_timeout)
        if self._llen < beg_len:
            self._llen = beg_len
        self._fin = False

    def __len__(self):
        """ Usable length for elements. """
        return self._llen - self._min_len

    def rest_split(self, fixed, elements=2):
        """ After a fixed length, split the rest of the line length among
            a number of different elements (default=2). """
        if self._llen < fixed:
            return 0
        # Floor division: callers use the result as a character count and
        # as a '*' width in format strings, so it has to stay an integer.
        return (self._llen - fixed) // elements

    def add(self, element, full_len=None):
        """ If there is room left in the line, above min_len, add element.
            Note that as soon as one add fails all the rest will fail too.
            Returns the element when it was added and '' otherwise.
            full_len is the length tested against the remaining room
            (defaults to len(element)); the room actually consumed is
            always len(element). """

        if full_len is None:
            full_len = len(element)
        if len(self) < full_len:
            self._fin = True
        if self._fin:
            return ''

        self._llen -= len(element)
        return element

    def rest(self):
        """ Current rest of line, same as .rest_split(fixed=0, elements=1). """
        return self._llen
96
class BaseMeter:
    """Base class for single-file progress meters.

    start(), update() and end() do the bookkeeping (timestamps, amount read,
    rate estimation) and then call the _do_start(), _do_update() and
    _do_end() hooks, which do nothing here and are meant to be overridden.
    """

    def __init__(self):
        self.update_period = 0.3 # seconds

        self.filename = None
        self.url = None
        self.basename = None
        self.text = None
        self.size = None
        self.start_time = None
        self.last_amount_read = 0
        self.last_update_time = None
        self.re = RateEstimator()

    def start(self, filename=None, url=None, basename=None,
              size=None, now=None, text=None):
        """Record the transfer's metadata and reset the counters.

        size is the expected total (None when unknown); now defaults to
        the current time.
        """
        self.filename = filename
        self.url = url
        self.basename = basename
        self.text = text

        self.size = size
        if size is not None: self.fsize = format_number(size) + 'B'

        if now is None: now = time.time()
        self.start_time = now
        self.re.start(size, now)
        self.last_amount_read = 0
        self.last_update_time = now
        self._do_start(now)

    def _do_start(self, now=None):
        pass

    def update(self, amount_read, now=None):
        """Report progress; acts at most once per update_period seconds."""
        # for a real gui, you probably want to override and put a call
        # to your mainloop iteration function here
        if now is None: now = time.time()
        # Test for an unset last_update_time first: adding update_period to
        # None would raise before the guard could take effect.
        if (not self.last_update_time or
                now >= self.last_update_time + self.update_period):
            self.re.update(amount_read, now)
            self.last_amount_read = amount_read
            self.last_update_time = now
            self._do_update(amount_read, now)

    def _do_update(self, amount_read, now=None):
        pass

    def end(self, amount_read, now=None):
        """Record the final amount read and call the _do_end() hook."""
        if now is None: now = time.time()
        self.re.update(amount_read, now)
        self.last_amount_read = amount_read
        self.last_update_time = now
        self._do_end(amount_read, now)

    def _do_end(self, amount_read, now=None):
        pass
155
156# This is kind of a hack, but progress is gotten from grabber which doesn't
157# know about the total size to download. So we do this so we can get the data
158# out of band here. This will be "fixed" one way or another soon.
# Combined size of the whole batch and the bytes already accounted for;
# a total of 0 means no batch total has been registered.
_text_meter_total_size = 0
_text_meter_sofar_size = 0
def text_meter_total_size(size, downloaded=0):
    """Register the combined size of a batch and the bytes already done."""
    global _text_meter_total_size, _text_meter_sofar_size
    _text_meter_total_size, _text_meter_sofar_size = size, downloaded
166
167#
168# update: No size (minimal: 17 chars)
169# -----------------------------------
170# <text> <rate> | <current size> <elapsed time>
171# 8-48 1 8 3 6 1 9 5
172#
173# Order: 1. <text>+<current size> (17)
174# 2. +<elapsed time> (10, total: 27)
175# 3. + ( 5, total: 32)
176# 4. +<rate> ( 9, total: 41)
177#
178# update: Size, Single file
179# -------------------------
180# <text> <pc> <bar> <rate> | <current size> <eta time> ETA
181# 8-25 1 3-4 1 6-16 1 8 3 6 1 9 1 3 1
182#
183# Order: 1. <text>+<current size> (17)
184# 2. +<eta time> (10, total: 27)
185# 3. +ETA ( 5, total: 32)
186# 4. +<pc> ( 4, total: 36)
187# 5. +<rate> ( 9, total: 45)
188# 6. +<bar> ( 7, total: 52)
189#
190# update: Size, All files
191# -----------------------
192# <text> <total pc> <pc> <bar> <rate> | <current size> <eta time> ETA
193# 8-22 1 5-7 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1
194#
195# Order: 1. <text>+<current size> (17)
196# 2. +<eta time> (10, total: 27)
197# 3. +ETA ( 5, total: 32)
198# 4. +<total pc> ( 5, total: 37)
199# 4. +<pc> ( 4, total: 41)
200# 5. +<rate> ( 9, total: 50)
201# 6. +<bar> ( 7, total: 57)
202#
203# end
204# ---
205# <text> | <current size> <elapsed time>
206# 8-56 3 6 1 9 5
207#
208# Order: 1. <text> ( 8)
209# 2. +<current size> ( 9, total: 17)
210# 3. +<elapsed time> (10, total: 27)
211# 4. + ( 5, total: 32)
212#
213
class TextMeter(BaseMeter):
    """Single-file progress meter that draws one status line on a stream.

    Each update rebuilds the line from the pieces that fit in the current
    terminal width (see TerminalLine) and ends it with a carriage return;
    _do_end() writes the final line followed by a newline.
    """

    def __init__(self, fo=sys.stderr):
        BaseMeter.__init__(self)
        self.fo = fo

    def _do_update(self, amount_read, now=None):
        elapsed_txt = format_time(self.re.elapsed_time())
        read_txt = format_number(amount_read)
        label = self.text
        if label is None:
            label = self.basename
        rate_txt = format_number(self.re.average_rate())

        # The overall percentage is only known when a batch total was set.
        overall_bytes = None
        if _text_meter_total_size:
            overall_bytes = _text_meter_sofar_size + amount_read
            overall_pc = (overall_bytes * 100) / _text_meter_total_size

        # Include text + ui_rate in minimal
        line = TerminalLine(8, 8+1+8)
        ui_size = line.add(' | %5sB' % read_txt)
        if self.size is None:
            # Unknown size: show the elapsed time, no bar or percentage.
            ui_time = line.add(' %9s' % elapsed_txt)
            ui_end = line.add(' ' * 5)
            ui_rate = line.add(' %5sB/s' % rate_txt)
            out = '%-*.*s%s%s%s%s\r' % (line.rest(), line.rest(), label,
                                        ui_rate, ui_size, ui_time, ui_end)
        else:
            remaining_txt = format_time(self.re.remaining_time())
            frac = self.re.fraction_read()

            ui_time = line.add(' %9s' % remaining_txt)
            ui_end = line.add(' ETA ')

            ui_sofar_pc = ''
            if overall_bytes is not None:
                ui_sofar_pc = line.add(' (%i%%)' % overall_pc,
                                       full_len=len(" (100%)"))

            ui_pc = line.add(' %2i%%' % (frac*100))
            ui_rate = line.add(' %5sB/s' % rate_txt)
            # Make text grow a bit before we start growing the bar too
            blen = 4 + line.rest_split(8 + 8 + 4)
            filled = blen * frac
            bar = '=' * int(filled)
            if filled - int(filled) >= 0.5:
                # At least half of the next cell is done: draw a half mark.
                bar += '-'
            ui_bar = line.add(' [%-*.*s]' % (blen, blen, bar))
            out = '%-*.*s%s%s%s%s%s%s%s\r' % (line.rest(), line.rest(), label,
                                              ui_sofar_pc, ui_pc, ui_bar,
                                              ui_rate, ui_size, ui_time,
                                              ui_end)

        self.fo.write(out)
        self.fo.flush()

    def _do_end(self, amount_read, now=None):
        global _text_meter_total_size
        global _text_meter_sofar_size

        time_txt = format_time(self.re.elapsed_time())
        size_txt = format_number(amount_read)
        label = self.text
        if label is None:
            label = self.basename

        line = TerminalLine(8)
        ui_size = line.add(' | %5sB' % size_txt)
        ui_time = line.add(' %9s' % time_txt)
        # A known size that was not reached marks the line as incomplete.
        not_done = self.size is not None and amount_read != self.size
        if not_done:
            ui_end = line.add(' ... ')
        else:
            ui_end = line.add(' ' * 5)

        out = '\r%-*.*s%s%s%s\n' % (line.rest(), line.rest(), label,
                                    ui_size, ui_time, ui_end)
        self.fo.write(out)
        self.fo.flush()

        # Don't add size to the sofar size until we have all of it.
        # If we don't have a size, then just pretend/hope we got all of it.
        if not_done:
            return

        if _text_meter_total_size:
            _text_meter_sofar_size += amount_read
        if _text_meter_total_size <= _text_meter_sofar_size:
            # The whole batch is accounted for: reset the out-of-band totals.
            _text_meter_total_size = 0
            _text_meter_sofar_size = 0

# Alias for TextMeter.
text_progress_meter = TextMeter
310
class MultiFileHelper(BaseMeter):
    """Per-file meter that forwards every event to a master meter.

    master must provide start_meter, update_meter, end_meter, failure_meter
    and message_meter.
    """

    def __init__(self, master):
        BaseMeter.__init__(self)
        self.master = master

    def _do_start(self, now):
        self.master.start_meter(self, now)

    def _do_update(self, amount_read, now):
        # elapsed time since last update
        self.master.update_meter(self, now)

    def _do_end(self, amount_read, now):
        # Keep formatted totals on the helper before notifying the master.
        elapsed = now - self.start_time
        self.ftotal_time = format_time(elapsed)
        self.ftotal_size = format_number(self.last_amount_read)
        self.master.end_meter(self, now)

    def failure(self, message, now=None):
        """Report to the master that this file's transfer failed."""
        self.master.failure_meter(self, message, now)

    def message(self, message):
        """Hand a message for this file to the master."""
        self.master.message_meter(self, message)
333
class MultiFileMeter:
    """Aggregate meter for a batch of files.

    Per-file helper meters are created with newMeter() and report back
    through start_meter(), update_meter(), end_meter() and failure_meter().
    Those methods update the batch counters and then call the matching
    _do_* hook, which does nothing here and is meant to be overridden.
    """
    helperclass = MultiFileHelper

    def __init__(self):
        self.meters = []
        self.in_progress_meters = []
        self._lock = thread.allocate_lock()
        self.update_period = 0.3 # seconds

        self.numfiles = None
        self.finished_files = 0
        self.failed_files = 0
        self.open_files = 0
        self.total_size = None
        self.failed_size = 0
        self.start_time = None
        self.finished_file_size = 0
        self.last_update_time = None
        self.re = RateEstimator()

    def start(self, numfiles=None, total_size=None, now=None):
        """Reset all counters for a new batch and start the rate estimator."""
        if now is None: now = time.time()
        self.numfiles = numfiles
        self.finished_files = 0
        self.failed_files = 0
        self.open_files = 0
        self.total_size = total_size
        self.failed_size = 0
        self.start_time = now
        self.finished_file_size = 0
        self.last_update_time = now
        self.re.start(total_size, now)
        self._do_start(now)

    def _do_start(self, now):
        pass

    def end(self, now=None):
        """Call the _do_end() hook for the batch."""
        if now is None: now = time.time()
        self._do_end(now)

    def _do_end(self, now):
        pass

    def lock(self): self._lock.acquire()
    def unlock(self): self._lock.release()

    ###########################################################
    # child meter creation and destruction
    def newMeter(self):
        """Create a helper meter bound to this master and register it."""
        newmeter = self.helperclass(self)
        self.meters.append(newmeter)
        return newmeter

    def removeMeter(self, meter):
        self.meters.remove(meter)

    ###########################################################
    # child functions - these should only be called by helpers
    def start_meter(self, meter, now):
        if meter not in self.meters:
            raise ValueError('attempt to use orphaned meter')
        self._lock.acquire()
        try:
            if meter not in self.in_progress_meters:
                self.in_progress_meters.append(meter)
                self.open_files += 1
        finally:
            self._lock.release()
        self._do_start_meter(meter, now)

    def _do_start_meter(self, meter, now):
        pass

    def update_meter(self, meter, now):
        if meter not in self.meters:
            raise ValueError('attempt to use orphaned meter')
        # Test for an unset last_update_time first: adding update_period to
        # None would raise before the guard could take effect.
        if (not self.last_update_time or
                now >= self.last_update_time + self.update_period):
            self.re.update(self._amount_read(), now)
            self.last_update_time = now
            self._do_update_meter(meter, now)

    def _do_update_meter(self, meter, now):
        pass

    def end_meter(self, meter, now):
        if meter not in self.meters:
            raise ValueError('attempt to use orphaned meter')
        self._lock.acquire()
        try:
            try: self.in_progress_meters.remove(meter)
            except ValueError: pass
            self.open_files -= 1
            self.finished_files += 1
            self.finished_file_size += meter.last_amount_read
        finally:
            self._lock.release()
        self._do_end_meter(meter, now)

    def _do_end_meter(self, meter, now):
        pass

    def failure_meter(self, meter, message, now):
        if meter not in self.meters:
            raise ValueError('attempt to use orphaned meter')
        self._lock.acquire()
        try:
            try: self.in_progress_meters.remove(meter)
            except ValueError: pass
            self.open_files -= 1
            self.failed_files += 1
            # Once a failed file has no known size the failed total is
            # unknown; it is set to None and stays None.
            if meter.size and self.failed_size is not None:
                self.failed_size += meter.size
            else:
                self.failed_size = None
        finally:
            self._lock.release()
        self._do_failure_meter(meter, message, now)

    def _do_failure_meter(self, meter, message, now):
        pass

    def message_meter(self, meter, message):
        pass

    ########################################################
    # internal functions
    def _amount_read(self):
        """Bytes from finished files plus bytes read by files in progress."""
        tot = self.finished_file_size
        for m in self.in_progress_meters:
            tot += m.last_amount_read
        return tot
466
467
class TextMultiFileMeter(MultiFileMeter):
    """MultiFileMeter that writes a text status line to a stream.

    Finished and failed files each get their own line; the batch summary
    line is redrawn in place after every event.
    """

    def __init__(self, fo=sys.stderr):
        self.fo = fo
        MultiFileMeter.__init__(self)

    # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
    def _do_update_meter(self, meter, now):
        self._lock.acquire()
        try:
            fmt = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
                  "time: %8.8s/%8.8s"
            df = self.finished_files
            tf = self.numfiles or 1
            pf = 100 * float(df)/tf + 0.49
            dd = self.re.last_amount_read
            td = self.total_size
            pd = 100 * (self.re.fraction_read() or 0) + 0.49
            dt = self.re.elapsed_time()
            rt = self.re.remaining_time()
            if rt is None: tt = None
            else: tt = dt + rt

            fdd = format_number(dd) + 'B'
            ftd = format_number(td) + 'B'
            fdt = format_time(dt, 1)
            ftt = format_time(tt, 1)

            out = '%-79.79s' % (fmt % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
            self.fo.write('\r' + out)
            self.fo.flush()
        finally:
            self._lock.release()

    def _do_end_meter(self, meter, now):
        self._lock.acquire()
        try:
            fmt = "%-30.30s %6.6s %8.8s %9.9s"
            fn = meter.basename
            size = meter.last_amount_read
            fsize = format_number(size) + 'B'
            et = meter.re.elapsed_time()
            fet = format_time(et, 1)
            # A zero elapsed time would make the rate a division by zero;
            # report a zero rate instead.
            frate = format_number(et and size / et) + 'B/s'

            out = '%-79.79s' % (fmt % (fn, fsize, fet, frate))
            self.fo.write('\r' + out + '\n')
        finally:
            self._lock.release()
        self._do_update_meter(meter, now)

    def _do_failure_meter(self, meter, message, now):
        self._lock.acquire()
        try:
            fmt = "%-30.30s %6.6s %s"
            fn = meter.basename
            if type(message) in (type(''), type(u'')):
                message = message.splitlines()
            if not message: message = ['']
            out = '%-79s' % (fmt % (fn, 'FAILED', message[0] or ''))
            self.fo.write('\r' + out + '\n')
            for m in message[1:]: self.fo.write(' ' + m + '\n')
        finally:
            # Always release here. _do_update_meter() takes the same lock,
            # so calling it with the lock still held would deadlock.
            self._lock.release()
        self._do_update_meter(meter, now)

    def message_meter(self, meter, message):
        self._lock.acquire()
        try:
            pass
        finally:
            self._lock.release()

    def _do_end(self, now):
        self._do_update_meter(None, now)
        self._lock.acquire()
        try:
            self.fo.write('\n')
            self.fo.flush()
        finally:
            self._lock.release()
548
549######################################################################
550# support classes and functions
551
class RateEstimator:
    """Estimate the transfer rate, elapsed time and remaining time.

    timescale is the number of seconds after which the rolling average is
    dominated by new samples (see _temporal_rolling_ave).
    """

    def __init__(self, timescale=5.0):
        self.timescale = timescale

    def start(self, total=None, now=None):
        """Begin a new estimate; total is the expected size or None."""
        if now is None: now = time.time()
        self.total = total
        self.start_time = now
        self.last_update_time = now
        self.last_amount_read = 0
        self.ave_rate = None

    def update(self, amount_read, now=None):
        """Fold a new sample (total bytes read so far, at time now) in."""
        if now is None: now = time.time()
        if amount_read == 0:
            # if we just started this file, all bets are off
            self.last_update_time = now
            self.last_amount_read = 0
            self.ave_rate = None
            return

        time_diff = now - self.last_update_time
        read_diff = amount_read - self.last_amount_read
        # First update, on reget is the file size
        if self.last_amount_read:
            self.last_update_time = now
            self.ave_rate = self._temporal_rolling_ave(
                time_diff, read_diff, self.ave_rate, self.timescale)
        self.last_amount_read = amount_read

    #####################################################################
    # result methods
    def average_rate(self):
        "get the average transfer rate (in bytes/second)"
        return self.ave_rate

    def elapsed_time(self):
        "the time between the start of the transfer and the most recent update"
        return self.last_update_time - self.start_time

    def remaining_time(self):
        "estimated time remaining"
        if not self.ave_rate or not self.total: return None
        return (self.total - self.last_amount_read) / self.ave_rate

    def fraction_read(self):
        """the fraction of the data that has been read
        (can be None for unknown transfer size)"""
        if self.total is None: return None
        elif self.total == 0: return 1.0
        else: return float(self.last_amount_read)/self.total

    #########################################################################
    # support methods
    def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
        """a temporal rolling average performs smooth averaging even when
        updates come at irregular intervals. This is performed by scaling
        the "epsilon" according to the time since the last update.
        Specifically, epsilon = time_diff / timescale

        As a general rule, the average will take on a completely new value
        after 'timescale' seconds."""
        epsilon = time_diff / timescale
        if epsilon > 1: epsilon = 1.0
        return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)

    def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
        """perform a "rolling average" iteration
        a rolling average "folds" new data into an existing average with
        some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
        a value of 0.0 means only the old value (initial value) counts,
        and a value of 1.0 means only the newest value is considered."""

        try:
            # float() keeps the rate from being truncated when both the
            # byte count and the timestamps are integers.
            recent_rate = float(read_diff) / time_diff
        except ZeroDivisionError:
            recent_rate = None
        if last_ave is None: return recent_rate
        elif recent_rate is None: return last_ave

        # at this point, both last_ave and recent_rate are numbers
        return epsilon * recent_rate + (1 - epsilon) * last_ave

    def _round_remaining_time(self, rt, start_time=15.0):
        """round the remaining time, depending on its size
        If rt is between n*start_time and (n+1)*start_time round downward
        to the nearest multiple of n (for any counting number n).
        If rt < start_time, round down to the nearest 1.
        For example (for start_time = 15.0):
         2.7 -> 2.0
         25.2 -> 25.0
         26.4 -> 26.0
         35.3 -> 34.0
         63.6 -> 60.0
        """

        # rt == 0 is included: math.log(0) below would raise ValueError.
        if rt <= 0: return 0.0
        shift = int(math.log(rt/start_time)/math.log(2))
        rt = int(rt)
        if shift <= 0: return rt
        return float(int(rt) >> shift << shift)
655
656
def format_time(seconds, use_hours=0):
    """Format a duration in seconds as 'MM:SS', or 'HH:MM:SS' if use_hours.

    None or a negative value is shown as dashes ('--:--' or '--:--:--').
    """
    if seconds is None or seconds < 0:
        placeholder = '--:--'
        if use_hours:
            placeholder = '--:--:--'
        return placeholder

    minutes, secs = divmod(int(seconds), 60)
    if not use_hours:
        return '%02i:%02i' % (minutes, secs)
    hours, minutes = divmod(minutes, 60)
    return '%02i:%02i:%02i' % (hours, minutes, secs)
671
def format_number(number, SI=0, space=' '):
    """Turn numbers into human-readable metric-like numbers

    The number is divided by 1024 (1000 when SI is true) until it is at
    most 999 and is then returned followed by space and the matching
    prefix symbol. None is formatted like 0.0.
    """
    symbols = ['',  # (none)
               'k', # kilo
               'M', # mega
               'G', # giga
               'T', # tera
               'P', # peta
               'E', # exa
               'Z', # zetta
               'Y'] # yotta

    if SI: step = 1000.0
    else: step = 1024.0

    # Callers pass rates that may still be unknown (None). Handle that
    # explicitly rather than relying on how None compares with numbers.
    if number is None:
        number = 0.0

    thresh = 999
    depth = 0
    max_depth = len(symbols) - 1

    # we want numbers between 0 and thresh, but don't exceed the length
    # of our list. In that event, the formatting will be screwed up,
    # but it'll still show the right number.
    while number > thresh and depth < max_depth:
        depth = depth + 1
        number = number / step

    # type(1 << 64) is long on Python 2 and int on Python 3; this avoids
    # the Python-2-only 1L literal.
    if type(number) in (type(1), type(1 << 64)):
        # it's an int or a long, which means it didn't get divided,
        # which means it's already short enough
        fmt = '%i%s%s'
    elif number < 9.95:
        # must use 9.95 for proper sizing. For example, 9.99 will be
        # rounded to 10.0 with the .1f format string (which is too long)
        fmt = '%.1f%s%s'
    else:
        fmt = '%.0f%s%s'

    return fmt % (float(number or 0), space, symbols[depth])
710
def _tst(fn, cur, tot, beg, size, *args):
    """Simulate one download on a TextMeter for the demo below.

    fn is file cur of tot and is size bytes long; the simulation starts at
    beg bytes. Each (inc, delay) pair in args covers an equal share of the
    size, advancing by inc bytes and sleeping delay seconds per update.
    """
    meter = TextMeter()
    label = "(%d/%d): %s" % (cur, tot, fn)
    meter.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size,
                text=label)
    done = beg
    for idx, (inc, delay) in enumerate(args):
        # This phase runs until its share of the file has been "read".
        phase_end = (size * (idx + 1)) / len(args)
        while done < phase_end:
            done += inc
            meter.update(done)
            time.sleep(delay)
    meter.end(size)
724
if __name__ == "__main__":
    # Demo: simulate ten downloads on the text meter. Pass "total" as the
    # first argument to also register the combined size of the batch.
    # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
    # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
    _small = ((10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
    _large = ((1000, 0.2), (1000, 0.1), (10000, 0.1))
    _burst = ((100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1))
    _runs = [
        # (file name, starting offset, size, (increment, delay) steps)
        ("sm-1.0.0-1.fc8.i386.rpm", 0, 1000,
         ((10, 0.2), (10, 0.1), (100, 0.25))),
        ("s-1.0.1-1.fc8.i386.rpm", 0, 10000, _small),
        ("m-1.0.1-2.fc8.i386.rpm", 5000, 10000, _small),
        ("large-file-name-Foo-11.8.7-4.5.6.1.fc8.x86_64.rpm",
         0, 1000000, _large),
        ("large-file-name-Foo2-11.8.7-4.5.6.2.fc8.x86_64.rpm",
         500001, 1000000, _large),
        ("large-file-name-Foo3-11.8.7-4.5.6.3.fc8.x86_64.rpm",
         750002, 1000000, _large),
        ("large-file-name-Foo4-10.8.7-4.5.6.1.fc8.x86_64.rpm",
         0, 10000, ((100, 0.1),)),
        ("large-file-name-Foo5-10.8.7-4.5.6.2.fc8.x86_64.rpm",
         5001, 10000, ((100, 0.1),)),
        ("large-file-name-Foo6-10.8.7-4.5.6.3.fc8.x86_64.rpm",
         7502, 10000, ((1, 0.1),)),
        ("large-file-name-Foox-9.8.7-4.5.6.1.fc8.x86_64.rpm",
         0, 1000000,
         ((10, 0.5),) + _burst * 4 + ((100000, 0.1), (1, 0.1))),
    ]

    if len(sys.argv) >= 2 and sys.argv[1] == 'total':
        # The batch total is the sum of the individual file sizes.
        text_meter_total_size(sum([_run[2] for _run in _runs]))
    for _idx, (_fn, _beg, _size, _steps) in enumerate(_runs):
        _tst(_fn, _idx + 1, len(_runs), _beg, _size, *_steps)
7560
=== removed directory '.pc/progress_object_callback_fix.diff'
=== removed directory '.pc/progress_object_callback_fix.diff/urlgrabber'
=== removed file '.pc/progress_object_callback_fix.diff/urlgrabber/grabber.py'
--- .pc/progress_object_callback_fix.diff/urlgrabber/grabber.py 2011-08-09 17:45:08 +0000
+++ .pc/progress_object_callback_fix.diff/urlgrabber/grabber.py 1970-01-01 00:00:00 +0000
@@ -1,1802 +0,0 @@
1# This library is free software; you can redistribute it and/or
2# modify it under the terms of the GNU Lesser General Public
3# License as published by the Free Software Foundation; either
4# version 2.1 of the License, or (at your option) any later version.
5#
6# This library is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9# Lesser General Public License for more details.
10#
11# You should have received a copy of the GNU Lesser General Public
12# License along with this library; if not, write to the
13# Free Software Foundation, Inc.,
14# 59 Temple Place, Suite 330,
15# Boston, MA 02111-1307 USA
16
17# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
20
21"""A high-level cross-protocol url-grabber.
22
23GENERAL ARGUMENTS (kwargs)
24
25 Where possible, the module-level default is indicated, and legal
26 values are provided.
27
28 copy_local = 0 [0|1]
29
30 ignored except for file:// urls, in which case it specifies
31 whether urlgrab should still make a copy of the file, or simply
32 point to the existing copy. The module level default for this
33 option is 0.
34
35 close_connection = 0 [0|1]
36
37 tells URLGrabber to close the connection after a file has been
38 transfered. This is ignored unless the download happens with the
39 http keepalive handler (keepalive=1). Otherwise, the connection
40 is left open for further use. The module level default for this
41 option is 0 (keepalive connections will not be closed).
42
43 keepalive = 1 [0|1]
44
45 specifies whether keepalive should be used for HTTP/1.1 servers
46 that support it. The module level default for this option is 1
47 (keepalive is enabled).
48
49 progress_obj = None
50
51 a class instance that supports the following methods:
52 po.start(filename, url, basename, length, text)
53 # length will be None if unknown
54 po.update(read) # read == bytes read so far
55 po.end()
56
57 text = None
58
59 specifies alternative text to be passed to the progress meter
60 object. If not given, the default progress meter will use the
61 basename of the file.
62
63 throttle = 1.0
64
65 a number - if it's an int, it's the bytes/second throttle limit.
66 If it's a float, it is first multiplied by bandwidth. If throttle
67 == 0, throttling is disabled. If None, the module-level default
68 (which can be set on default_grabber.throttle) is used. See
69 BANDWIDTH THROTTLING for more information.
70
71 timeout = 300
72
73 a positive integer expressing the number of seconds to wait before
74 timing out attempts to connect to a server. If the value is None
75 or 0, connection attempts will not time out. The timeout is passed
76 to the underlying pycurl object as its CONNECTTIMEOUT option, see
77 the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
78 http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
79
80 bandwidth = 0
81
82 the nominal max bandwidth in bytes/second. If throttle is a float
83 and bandwidth == 0, throttling is disabled. If None, the
84 module-level default (which can be set on
85 default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
86 more information.
87
88 range = None
89
90 a tuple of the form (first_byte, last_byte) describing a byte
91 range to retrieve. Either or both of the values may set to
92 None. If first_byte is None, byte offset 0 is assumed. If
93 last_byte is None, the last byte available is assumed. Note that
94 the range specification is python-like in that (0,10) will yeild
95 the first 10 bytes of the file.
96
97 If set to None, no range will be used.
98
99 reget = None [None|'simple'|'check_timestamp']
100
101 whether to attempt to reget a partially-downloaded file. Reget
102 only applies to .urlgrab and (obviously) only if there is a
103 partially downloaded file. Reget has two modes:
104
105 'simple' -- the local file will always be trusted. If there
106 are 100 bytes in the local file, then the download will always
107 begin 100 bytes into the requested file.
108
109 'check_timestamp' -- the timestamp of the server file will be
110 compared to the timestamp of the local file. ONLY if the
111 local file is newer than or the same age as the server file
112 will reget be used. If the server file is newer, or the
113 timestamp is not returned, the entire file will be fetched.
114
115 NOTE: urlgrabber can do very little to verify that the partial
116 file on disk is identical to the beginning of the remote file.
117 You may want to either employ a custom "checkfunc" or simply avoid
118 using reget in situations where corruption is a concern.
119
120 user_agent = 'urlgrabber/VERSION'
121
122 a string, usually of the form 'AGENT/VERSION' that is provided to
123 HTTP servers in the User-agent header. The module level default
124 for this option is "urlgrabber/VERSION".
125
126 http_headers = None
127
128 a tuple of 2-tuples, each containing a header and value. These
129 will be used for http and https requests only. For example, you
130 can do
131 http_headers = (('Pragma', 'no-cache'),)
132
133 ftp_headers = None
134
135 this is just like http_headers, but will be used for ftp requests.
136
137 proxies = None
138
139 a dictionary that maps protocol schemes to proxy hosts. For
140 example, to use a proxy server on host "foo" port 3128 for http
141 and https URLs:
142 proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
143 note that proxy authentication information may be provided using
144 normal URL constructs:
145 proxies={ 'http' : 'http://user:host@foo:3128' }
146 Lastly, if proxies is None, the default environment settings will
147 be used.
148
149 prefix = None
150
151 a url prefix that will be prepended to all requested urls. For
152 example:
153 g = URLGrabber(prefix='http://foo.com/mirror/')
154 g.urlgrab('some/file.txt')
155 ## this will fetch 'http://foo.com/mirror/some/file.txt'
156 This option exists primarily to allow identical behavior to
157 MirrorGroup (and derived) instances. Note: a '/' will be inserted
158 if necessary, so you cannot specify a prefix that ends with a
159 partial file or directory name.
160
161 opener = None
162 No-op when using the curl backend (default)
163
164 cache_openers = True
165 No-op when using the curl backend (default)
166
167 data = None
168
169 Only relevant for the HTTP family (and ignored for other
170 protocols), this allows HTTP POSTs. When the data kwarg is
171 present (and not None), an HTTP request will automatically become
172 a POST rather than GET. This is done by direct passthrough to
173 urllib2. If you use this, you may also want to set the
174 'Content-length' and 'Content-type' headers with the http_headers
175 option. Note that python 2.2 handles the case of these
176 badly and if you do not use the proper case (shown here), your
177 values will be overridden with the defaults.
178
179 urlparser = URLParser()
180
181 The URLParser class handles pre-processing of URLs, including
182 auth-handling for user/pass encoded in http urls, file handing
183 (that is, filenames not sent as a URL), and URL quoting. If you
184 want to override any of this behavior, you can pass in a
185 replacement instance. See also the 'quote' option.
186
187 quote = None
188
189 Whether or not to quote the path portion of a url.
190 quote = 1 -> quote the URLs (they're not quoted yet)
191 quote = 0 -> do not quote them (they're already quoted)
192 quote = None -> guess what to do
193
194 This option only affects proper urls like 'file:///etc/passwd'; it
195 does not affect 'raw' filenames like '/etc/passwd'. The latter
196 will always be quoted as they are converted to URLs. Also, only
197 the path part of a url is quoted. If you need more fine-grained
198 control, you should probably subclass URLParser and pass it in via
199 the 'urlparser' option.
200
201 ssl_ca_cert = None
202
203 this option can be used if M2Crypto is available and will be
204 ignored otherwise. If provided, it will be used to create an SSL
205 context. If both ssl_ca_cert and ssl_context are provided, then
206 ssl_context will be ignored and a new context will be created from
207 ssl_ca_cert.
208
209 ssl_context = None
210
211 No-op when using the curl backend (default)
212
213
214 self.ssl_verify_peer = True
215
216 Check the server's certificate to make sure it is valid with what our CA validates
217
218 self.ssl_verify_host = True
219
220 Check the server's hostname to make sure it matches the certificate DN
221
222 self.ssl_key = None
223
224 Path to the key the client should use to connect/authenticate with
225
226 self.ssl_key_type = 'PEM'
227
228 PEM or DER - format of key
229
230 self.ssl_cert = None
231
232 Path to the ssl certificate the client should use to authenticate with
233
234 self.ssl_cert_type = 'PEM'
235
236 PEM or DER - format of certificate
237
238 self.ssl_key_pass = None
239
240 password to access the ssl_key
241
242 self.size = None
243
244 size (in bytes) or Maximum size of the thing being downloaded.
245 This is mostly to keep us from exploding with an endless datastream
246
247 self.max_header_size = 2097152
248
249 Maximum size (in bytes) of the headers.
250
251
252RETRY RELATED ARGUMENTS
253
254 retry = None
255
256 the number of times to retry the grab before bailing. If this is
257 zero, it will retry forever. This was intentional... really, it
258 was :). If this value is not supplied or is supplied but is None
259 retrying does not occur.
260
261 retrycodes = [-1,2,4,5,6,7]
262
263 a sequence of errorcodes (values of e.errno) for which it should
264 retry. See the doc on URLGrabError for more details on this. You
265 might consider modifying a copy of the default codes rather than
266 building yours from scratch so that if the list is extended in the
267 future (or one code is split into two) you can still enjoy the
268 benefits of the default list. You can do that with something like
269 this:
270
271 retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
272 if 12 not in retrycodes:
273 retrycodes.append(12)
274
275 checkfunc = None
276
277 a function to do additional checks. This defaults to None, which
278 means no additional checking. The function should simply return
279 on a successful check. It should raise URLGrabError on an
280 unsuccessful check. Raising of any other exception will be
281 considered immediate failure and no retries will occur.
282
283 If it raises URLGrabError, the error code will determine the retry
284 behavior. Negative error numbers are reserved for use by these
285 passed in functions, so you can use many negative numbers for
286 different types of failure. By default, -1 results in a retry,
287 but this can be customized with retrycodes.
288
289 If you simply pass in a function, it will be given exactly one
290 argument: a CallbackObject instance with the .url attribute
291 defined and either .filename (for urlgrab) or .data (for urlread).
292 For urlgrab, .filename is the name of the local file. For
293 urlread, .data is the actual string data. If you need other
294 arguments passed to the callback (program state of some sort), you
295 can do so like this:
296
297 checkfunc=(function, ('arg1', 2), {'kwarg': 3})
298
299 if the downloaded file has filename /tmp/stuff, then this will
300 result in this call (for urlgrab):
301
302 function(obj, 'arg1', 2, kwarg=3)
303 # obj.filename = '/tmp/stuff'
304 # obj.url = 'http://foo.com/stuff'
305
306 NOTE: both the "args" tuple and "kwargs" dict must be present if
307 you use this syntax, but either (or both) can be empty.
308
309 failure_callback = None
310
311 The callback that gets called during retries when an attempt to
312 fetch a file fails. The syntax for specifying the callback is
313 identical to checkfunc, except for the attributes defined in the
314 CallbackObject instance. The attributes for failure_callback are:
315
316 exception = the raised exception
317 url = the url we're trying to fetch
318 tries = the number of tries so far (including this one)
319 retry = the value of the retry option
320
321 The callback is present primarily to inform the calling program of
322 the failure, but if it raises an exception (including the one it's
323 passed) that exception will NOT be caught and will therefore cause
324 future retries to be aborted.
325
326 The callback is called for EVERY failure, including the last one.
327 On the last try, the callback can raise an alternate exception,
328 but it cannot (without severe trickiness) prevent the exception
329 from being raised.
330
331 interrupt_callback = None
332
333 This callback is called if KeyboardInterrupt is received at any
334 point in the transfer. Basically, this callback can have three
335 impacts on the fetch process based on the way it exits:
336
337 1) raise no exception: the current fetch will be aborted, but
338 any further retries will still take place
339
340 2) raise a URLGrabError: if you're using a MirrorGroup, then
341 this will prompt a failover to the next mirror according to
342 the behavior of the MirrorGroup subclass. It is recommended
343 that you raise URLGrabError with code 15, 'user abort'. If
344 you are NOT using a MirrorGroup subclass, then this is the
345 same as (3).
346
347 3) raise some other exception (such as KeyboardInterrupt), which
348 will not be caught at either the grabber or mirror levels.
349 That is, it will be raised up all the way to the caller.
350
351 This callback is very similar to failure_callback. They are
352 passed the same arguments, so you could use the same function for
353 both.
354
355BANDWIDTH THROTTLING
356
357 urlgrabber supports throttling via two values: throttle and
358 bandwidth. Between the two, you can either specify an absolute
359 throttle threshold or specify a threshold as a fraction of maximum
360 available bandwidth.
361
362 throttle is a number - if it's an int, it's the bytes/second
363 throttle limit. If it's a float, it is first multiplied by
364 bandwidth. If throttle == 0, throttling is disabled. If None, the
365 module-level default (which can be set with set_throttle) is used.
366
367 bandwidth is the nominal max bandwidth in bytes/second. If throttle
368 is a float and bandwidth == 0, throttling is disabled. If None, the
369 module-level default (which can be set with set_bandwidth) is used.
370
371 THROTTLING EXAMPLES:
372
373 Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
374 per second, or 12,500,000 Bytes per second. You have a number of
375 throttling options:
376
377 *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
378
379 This will limit urlgrab to use half of your available bandwidth.
380
381 *) set_throttle(6250000) # throttle is an int
382
383 This will also limit urlgrab to use half of your available
384 bandwidth, regardless of what bandwidth is set to.
385
386 *) set_bandwidth(6250000); set_throttle(1.0) # float
387
388 Use half your bandwidth
389
390 *) set_bandwidth(6250000); set_throttle(2.0) # float
391
392 Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
393
394 *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
395
396 Disable throttling - this is more efficient than a very large
397 throttle setting.
398
399 *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0
400
401 Disable throttling - this is the default when the module is loaded.
402
403 SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
404
405 While this is flexible, it's not extremely obvious to the user. I
406 suggest you implement a float throttle as a percent to make the
407 distinction between absolute and relative throttling very explicit.
408
409 Also, you may want to convert the units to something more convenient
410 than bytes/second, such as kbps or kB/s, etc.
411
412"""
413
414
415
416import os
417import sys
418import urlparse
419import time
420import string
421import urllib
422import urllib2
423import mimetools
424import thread
425import types
426import stat
427import pycurl
428from ftplib import parse150
429from StringIO import StringIO
430from httplib import HTTPException
431import socket
432from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
433
434########################################################################
435# MODULE INITIALIZATION
436########################################################################
437try:
438 exec('from ' + (__name__.split('.'))[0] + ' import __version__')
439except:
440 __version__ = '???'
441
442try:
443 # this part isn't going to do much - need to talk to gettext
444 from i18n import _
445except ImportError, msg:
446 def _(st): return st
447
448########################################################################
449# functions for debugging output. These functions are here because they
450# are also part of the module initialization.
451DEBUG = None
452def set_logger(DBOBJ):
453 """Set the DEBUG object. This is called by _init_default_logger when
454 the environment variable URLGRABBER_DEBUG is set, but can also be
455 called by a calling program. Basically, if the calling program uses
456 the logging module and would like to incorporate urlgrabber logging,
457 then it can do so this way. It's probably not necessary as most
458 internal logging is only for debugging purposes.
459
460 The passed-in object should be a logging.Logger instance. It will
461 be pushed into the keepalive and byterange modules if they're
462 being used. The mirror module pulls this object in on import, so
463 you will need to manually push into it. In fact, you may find it
464 tidier to simply push your logging object (or objects) into each
465 of these modules independently.
466 """
467
468 global DEBUG
469 DEBUG = DBOBJ
470
471def _init_default_logger(logspec=None):
472 '''Examines the environment variable URLGRABBER_DEBUG and creates
473 a logging object (logging.logger) based on the contents. It takes
474 the form
475
476 URLGRABBER_DEBUG=level,filename
477
478 where "level" can be either an integer or a log level from the
479 logging module (DEBUG, INFO, etc). If the integer is zero or
480 less, logging will be disabled. Filename is the filename where
481 logs will be sent. If it is "-", then stdout will be used. If
482 the filename is empty or missing, stderr will be used. If the
483 variable cannot be processed or the logging module cannot be
484 imported (python < 2.3) then logging will be disabled. Here are
485 some examples:
486
487 URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
488 URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
489 URLGRABBER_DEBUG=INFO # log info and higher to stderr
490
491 This function is called during module initialization. It is not
492 intended to be called from outside. The only reason it is a
493 function at all is to keep the module-level namespace tidy and to
494 collect the code into a nice block.'''
495
496 try:
497 if logspec is None:
498 logspec = os.environ['URLGRABBER_DEBUG']
499 dbinfo = logspec.split(',')
500 import logging
501 level = logging._levelNames.get(dbinfo[0], None)
502 if level is None: level = int(dbinfo[0])
503 if level < 1: raise ValueError()
504
505 formatter = logging.Formatter('%(asctime)s %(message)s')
506 if len(dbinfo) > 1: filename = dbinfo[1]
507 else: filename = ''
508 if filename == '': handler = logging.StreamHandler(sys.stderr)
509 elif filename == '-': handler = logging.StreamHandler(sys.stdout)
510 else: handler = logging.FileHandler(filename)
511 handler.setFormatter(formatter)
512 DBOBJ = logging.getLogger('urlgrabber')
513 DBOBJ.addHandler(handler)
514 DBOBJ.setLevel(level)
515 except (KeyError, ImportError, ValueError):
516 DBOBJ = None
517 set_logger(DBOBJ)
518
519def _log_package_state():
520 if not DEBUG: return
521 DEBUG.info('urlgrabber version = %s' % __version__)
522 DEBUG.info('trans function "_" = %s' % _)
523
524_init_default_logger()
525_log_package_state()
526
527
528# normally this would be from i18n or something like it ...
529def _(st):
530 return st
531
532########################################################################
533# END MODULE INITIALIZATION
534########################################################################
535
536
537
538class URLGrabError(IOError):
539 """
540 URLGrabError error codes:
541
542 URLGrabber error codes (0 -- 255)
543 0 - everything looks good (you should never see this)
544 1 - malformed url
545 2 - local file doesn't exist
546 3 - request for non-file local file (dir, etc)
547 4 - IOError on fetch
548 5 - OSError on fetch
549 6 - no content length header when we expected one
550 7 - HTTPException
551 8 - Exceeded read limit (for urlread)
552 9 - Requested byte range not satisfiable.
553 10 - Byte range requested, but range support unavailable
554 11 - Illegal reget mode
555 12 - Socket timeout
556 13 - malformed proxy url
557 14 - HTTPError (includes .code and .exception attributes)
558 15 - user abort
559 16 - error writing to local file
560
561 MirrorGroup error codes (256 -- 511)
562 256 - No more mirrors left to try
563
564 Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
565 [ this range reserved for application-specific error codes ]
566
567 Retry codes (< 0)
568 -1 - retry the download, unknown reason
569
570 Note: to test which group a code is in, you can simply do integer
571 division by 256: e.errno / 256
572
573 Negative codes are reserved for use by functions passed in to
574 retrygrab with checkfunc. The value -1 is built in as a generic
575 retry code and is already included in the retrycodes list.
576 Therefore, you can create a custom check function that simply
577 raises URLGrabError with code -1 and the fetch will be re-tried. For more customized
578 retries, you can use other negative numbers and include them in
579 retrycodes. This is nice for outputting useful messages about
580 what failed.
581
582 You can use these error codes like so:
583 try: urlgrab(url)
584 except URLGrabError, e:
585 if e.errno == 3: ...
586 # or
587 print e.strerror
588 # or simply
589 print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
590 """
591 def __init__(self, *args):
592 IOError.__init__(self, *args)
593 self.url = "No url specified"
594
595class CallbackObject:
596 """Container for returned callback data.
597
598 This is currently a dummy class into which urlgrabber can stuff
599 information for passing to callbacks. This way, the prototype for
600 all callbacks is the same, regardless of the data that will be
601 passed back. Any function that accepts a callback function as an
602 argument SHOULD document what it will define in this object.
603
604 It is possible that this class will have some greater
605 functionality in the future.
606 """
607 def __init__(self, **kwargs):
608 self.__dict__.update(kwargs)
609
610def urlgrab(url, filename=None, **kwargs):
611 """grab the file at <url> and make a local copy at <filename>
612 If filename is none, the basename of the url is used.
613 urlgrab returns the filename of the local file, which may be different
614 from the passed-in filename if the copy_local kwarg == 0.
615
616 See module documentation for a description of possible kwargs.
617 """
618 return default_grabber.urlgrab(url, filename, **kwargs)
619
620def urlopen(url, **kwargs):
621 """open the url and return a file object
622 If a progress object or throttle specifications exist, then
623 a special file object will be returned that supports them.
624 The file object can be treated like any other file object.
625
626 See module documentation for a description of possible kwargs.
627 """
628 return default_grabber.urlopen(url, **kwargs)
629
630def urlread(url, limit=None, **kwargs):
631 """read the url into a string, up to 'limit' bytes
632 If the limit is exceeded, an exception will be thrown. Note that urlread
633 is NOT intended to be used as a way of saying "I want the first N bytes"
634 but rather 'read the whole file into memory, but don't use too much'
635
636 See module documentation for a description of possible kwargs.
637 """
638 return default_grabber.urlread(url, limit, **kwargs)
639
640
641class URLParser:
642 """Process the URLs before passing them to urllib2.
643
644 This class does several things:
645
646 * add any prefix
647 * translate a "raw" file to a proper file: url
648 * handle any http or https auth that's encoded within the url
649 * quote the url
650
651 Only the "parse" method is called directly, and it calls sub-methods.
652
653 An instance of this class is held in the options object, which
654 means that it's easy to change the behavior by sub-classing and
655 passing the replacement in. It need only have a method like:
656
657 url, parts = urlparser.parse(url, opts)
658 """
659
660 def parse(self, url, opts):
661 """parse the url and return the (modified) url and its parts
662
663 Note: a raw file WILL be quoted when it's converted to a URL.
664 However, other urls (ones which come with a proper scheme) may
665 or may not be quoted according to opts.quote
666
667 opts.quote = 1 --> quote it
668 opts.quote = 0 --> do not quote it
669 opts.quote = None --> guess
670 """
671 quote = opts.quote
672
673 if opts.prefix:
674 url = self.add_prefix(url, opts.prefix)
675
676 parts = urlparse.urlparse(url)
677 (scheme, host, path, parm, query, frag) = parts
678
679 if not scheme or (len(scheme) == 1 and scheme in string.letters):
680 # if a scheme isn't specified, we guess that it's "file:"
681 if url[0] not in '/\\': url = os.path.abspath(url)
682 url = 'file:' + urllib.pathname2url(url)
683 parts = urlparse.urlparse(url)
684 quote = 0 # pathname2url quotes, so we won't do it again
685
686 if scheme in ['http', 'https']:
687 parts = self.process_http(parts, url)
688
689 if quote is None:
690 quote = self.guess_should_quote(parts)
691 if quote:
692 parts = self.quote(parts)
693
694 url = urlparse.urlunparse(parts)
695 return url, parts
696
697 def add_prefix(self, url, prefix):
698 if prefix[-1] == '/' or url[0] == '/':
699 url = prefix + url
700 else:
701 url = prefix + '/' + url
702 return url
703
704 def process_http(self, parts, url):
705 (scheme, host, path, parm, query, frag) = parts
706 # TODO: auth-parsing here, maybe? pycurl doesn't really need it
707 return (scheme, host, path, parm, query, frag)
708
709 def quote(self, parts):
710 """quote the URL
711
712 This method quotes ONLY the path part. If you need to quote
713 other parts, you should override this and pass in your derived
714 class. The other alternative is to quote other parts before
715 passing into urlgrabber.
716 """
717 (scheme, host, path, parm, query, frag) = parts
718 path = urllib.quote(path)
719 return (scheme, host, path, parm, query, frag)
720
721 hexvals = '0123456789ABCDEF'
722 def guess_should_quote(self, parts):
723 """
724 Guess whether we should quote a path. This amounts to
725 guessing whether it's already quoted.
726
727 find ' ' -> 1
728 find '%' -> 1
729 find '%XX' -> 0
730 else -> 1
731 """
732 (scheme, host, path, parm, query, frag) = parts
733 if ' ' in path:
734 return 1
735 ind = string.find(path, '%')
736 if ind > -1:
737 while ind > -1:
738 if len(path) < ind+3:
739 return 1
740 code = path[ind+1:ind+3].upper()
741 if code[0] not in self.hexvals or \
742 code[1] not in self.hexvals:
743 return 1
744 ind = string.find(path, '%', ind+1)
745 return 0
746 return 1
747
748class URLGrabberOptions:
749 """Class to ease kwargs handling."""
750
751 def __init__(self, delegate=None, **kwargs):
752 """Initialize URLGrabberOptions object.
753 Set default values for all options and then update options specified
754 in kwargs.
755 """
756 self.delegate = delegate
757 if delegate is None:
758 self._set_defaults()
759 self._set_attributes(**kwargs)
760
761 def __getattr__(self, name):
762 if self.delegate and hasattr(self.delegate, name):
763 return getattr(self.delegate, name)
764 raise AttributeError, name
765
766 def raw_throttle(self):
767 """Calculate raw throttle value from throttle and bandwidth
768 values.
769 """
770 if self.throttle <= 0:
771 return 0
772 elif type(self.throttle) == type(0):
773 return float(self.throttle)
774 else: # throttle is a float
775 return self.bandwidth * self.throttle
776
777 def derive(self, **kwargs):
778 """Create a derived URLGrabberOptions instance.
779 This method creates a new instance and overrides the
780 options specified in kwargs.
781 """
782 return URLGrabberOptions(delegate=self, **kwargs)
783
784 def _set_attributes(self, **kwargs):
785 """Update object attributes with those provided in kwargs."""
786 self.__dict__.update(kwargs)
787 if kwargs.has_key('range'):
788 # normalize the supplied range value
789 self.range = range_tuple_normalize(self.range)
790 if not self.reget in [None, 'simple', 'check_timestamp']:
791 raise URLGrabError(11, _('Illegal reget mode: %s') \
792 % (self.reget, ))
793
794 def _set_defaults(self):
795 """Set all options to their default values.
796 When adding new options, make sure a default is
797 provided here.
798 """
799 self.progress_obj = None
800 self.throttle = 1.0
801 self.bandwidth = 0
802 self.retry = None
803 self.retrycodes = [-1,2,4,5,6,7]
804 self.checkfunc = None
805 self.copy_local = 0
806 self.close_connection = 0
807 self.range = None
808 self.user_agent = 'urlgrabber/%s' % __version__
809 self.keepalive = 1
810 self.proxies = None
811 self.reget = None
812 self.failure_callback = None
813 self.interrupt_callback = None
814 self.prefix = None
815 self.opener = None
816 self.cache_openers = True
817 self.timeout = 300
818 self.text = None
819 self.http_headers = None
820 self.ftp_headers = None
821 self.data = None
822 self.urlparser = URLParser()
823 self.quote = None
824 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
825 self.ssl_context = None # no-op in pycurl
826 self.ssl_verify_peer = True # check peer's cert for authenticityb
827 self.ssl_verify_host = True # make sure who they are and who the cert is for matches
828 self.ssl_key = None # client key
829 self.ssl_key_type = 'PEM' #(or DER)
830 self.ssl_cert = None # client cert
831 self.ssl_cert_type = 'PEM' # (or DER)
832 self.ssl_key_pass = None # password to access the key
833 self.size = None # if we know how big the thing we're getting is going
834 # to be. this is ultimately a MAXIMUM size for the file
835 self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
836
837 def __repr__(self):
838 return self.format()
839
840 def format(self, indent=' '):
841 keys = self.__dict__.keys()
842 if self.delegate is not None:
843 keys.remove('delegate')
844 keys.sort()
845 s = '{\n'
846 for k in keys:
847 s = s + indent + '%-15s: %s,\n' % \
848 (repr(k), repr(self.__dict__[k]))
849 if self.delegate:
850 df = self.delegate.format(indent + ' ')
851 s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
852 s = s + indent + '}'
853 return s
854
855class URLGrabber:
856 """Provides easy opening of URLs with a variety of options.
857
858 All options are specified as kwargs. Options may be specified when
859 the class is created and may be overridden on a per request basis.
860
861 New objects inherit default values from default_grabber.
862 """
863
864 def __init__(self, **kwargs):
865 self.opts = URLGrabberOptions(**kwargs)
866
867 def _retry(self, opts, func, *args):
868 tries = 0
869 while 1:
870 # there are only two ways out of this loop. The second has
871 # several "sub-ways"
872 # 1) via the return in the "try" block
873 # 2) by some exception being raised
874 # a) an excepton is raised that we don't "except"
875 # b) a callback raises ANY exception
876 # c) we're not retry-ing or have run out of retries
877 # d) the URLGrabError code is not in retrycodes
878 # beware of infinite loops :)
879 tries = tries + 1
880 exception = None
881 retrycode = None
882 callback = None
883 if DEBUG: DEBUG.info('attempt %i/%s: %s',
884 tries, opts.retry, args[0])
885 try:
886 r = apply(func, (opts,) + args, {})
887 if DEBUG: DEBUG.info('success')
888 return r
889 except URLGrabError, e:
890 exception = e
891 callback = opts.failure_callback
892 retrycode = e.errno
893 except KeyboardInterrupt, e:
894 exception = e
895 callback = opts.interrupt_callback
896
897 if DEBUG: DEBUG.info('exception: %s', exception)
898 if callback:
899 if DEBUG: DEBUG.info('calling callback: %s', callback)
900 cb_func, cb_args, cb_kwargs = self._make_callback(callback)
901 obj = CallbackObject(exception=exception, url=args[0],
902 tries=tries, retry=opts.retry)
903 cb_func(obj, *cb_args, **cb_kwargs)
904
905 if (opts.retry is None) or (tries == opts.retry):
906 if DEBUG: DEBUG.info('retries exceeded, re-raising')
907 raise
908
909 if (retrycode is not None) and (retrycode not in opts.retrycodes):
910 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
911 retrycode, opts.retrycodes)
912 raise
913
914 def urlopen(self, url, **kwargs):
915 """open the url and return a file object
916 If a progress object or throttle value specified when this
917 object was created, then a special file object will be
918 returned that supports them. The file object can be treated
919 like any other file object.
920 """
921 opts = self.opts.derive(**kwargs)
922 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
923 (url,parts) = opts.urlparser.parse(url, opts)
924 def retryfunc(opts, url):
925 return PyCurlFileObject(url, filename=None, opts=opts)
926 return self._retry(opts, retryfunc, url)
927
928 def urlgrab(self, url, filename=None, **kwargs):
929 """grab the file at <url> and make a local copy at <filename>
930 If filename is none, the basename of the url is used.
931 urlgrab returns the filename of the local file, which may be
932 different from the passed-in filename if copy_local == 0.
933 """
934 opts = self.opts.derive(**kwargs)
935 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
936 (url,parts) = opts.urlparser.parse(url, opts)
937 (scheme, host, path, parm, query, frag) = parts
938 if filename is None:
939 filename = os.path.basename( urllib.unquote(path) )
940 if scheme == 'file' and not opts.copy_local:
941 # just return the name of the local file - don't make a
942 # copy currently
943 path = urllib.url2pathname(path)
944 if host:
945 path = os.path.normpath('//' + host + path)
946 if not os.path.exists(path):
947 err = URLGrabError(2,
948 _('Local file does not exist: %s') % (path, ))
949 err.url = url
950 raise err
951 elif not os.path.isfile(path):
952 err = URLGrabError(3,
953 _('Not a normal file: %s') % (path, ))
954 err.url = url
955 raise err
956
957 elif not opts.range:
958 if not opts.checkfunc is None:
959 cb_func, cb_args, cb_kwargs = \
960 self._make_callback(opts.checkfunc)
961 obj = CallbackObject()
962 obj.filename = path
963 obj.url = url
964 apply(cb_func, (obj, )+cb_args, cb_kwargs)
965 return path
966
967 def retryfunc(opts, url, filename):
968 fo = PyCurlFileObject(url, filename, opts)
969 try:
970 fo._do_grab()
971 if not opts.checkfunc is None:
972 cb_func, cb_args, cb_kwargs = \
973 self._make_callback(opts.checkfunc)
974 obj = CallbackObject()
975 obj.filename = filename
976 obj.url = url
977 apply(cb_func, (obj, )+cb_args, cb_kwargs)
978 finally:
979 fo.close()
980 return filename
981
982 return self._retry(opts, retryfunc, url, filename)
983
984 def urlread(self, url, limit=None, **kwargs):
985 """read the url into a string, up to 'limit' bytes
986 If the limit is exceeded, an exception will be thrown. Note
987 that urlread is NOT intended to be used as a way of saying
988 "I want the first N bytes" but rather 'read the whole file
989 into memory, but don't use too much'
990 """
991 opts = self.opts.derive(**kwargs)
992 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
993 (url,parts) = opts.urlparser.parse(url, opts)
994 if limit is not None:
995 limit = limit + 1
996
997 def retryfunc(opts, url, limit):
998 fo = PyCurlFileObject(url, filename=None, opts=opts)
999 s = ''
1000 try:
1001 # this is an unfortunate thing. Some file-like objects
1002 # have a default "limit" of None, while the built-in (real)
1003 # file objects have -1. They each break the other, so for
1004 # now, we just force the default if necessary.
1005 if limit is None: s = fo.read()
1006 else: s = fo.read(limit)
1007
1008 if not opts.checkfunc is None:
1009 cb_func, cb_args, cb_kwargs = \
1010 self._make_callback(opts.checkfunc)
1011 obj = CallbackObject()
1012 obj.data = s
1013 obj.url = url
1014 apply(cb_func, (obj, )+cb_args, cb_kwargs)
1015 finally:
1016 fo.close()
1017 return s
1018
1019 s = self._retry(opts, retryfunc, url, limit)
1020 if limit and len(s) > limit:
1021 err = URLGrabError(8,
1022 _('Exceeded limit (%i): %s') % (limit, url))
1023 err.url = url
1024 raise err
1025
1026 return s
1027
1028 def _make_callback(self, callback_obj):
1029 if callable(callback_obj):
1030 return callback_obj, (), {}
1031 else:
1032 return callback_obj
1033
1034# create the default URLGrabber used by urlXXX functions.
1035# NOTE: actual defaults are set in URLGrabberOptions
1036default_grabber = URLGrabber()
1037
1038
1039class PyCurlFileObject():
1040 def __init__(self, url, filename, opts):
1041 self.fo = None
1042 self._hdr_dump = ''
1043 self._parsed_hdr = None
1044 self.url = url
1045 self.scheme = urlparse.urlsplit(self.url)[0]
1046 self.filename = filename
1047 self.append = False
1048 self.reget_time = None
1049 self.opts = opts
1050 if self.opts.reget == 'check_timestamp':
1051 raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
1052 self._complete = False
1053 self._rbuf = ''
1054 self._rbufsize = 1024*8
1055 self._ttime = time.time()
1056 self._tsize = 0
1057 self._amount_read = 0
1058 self._reget_length = 0
1059 self._prog_running = False
1060 self._error = (None, None)
1061 self.size = 0
1062 self._hdr_ended = False
1063 self._do_open()
1064
1065
1066 def geturl(self):
1067 """ Provide the geturl() method, used to be got from
1068 urllib.addinfourl, via. urllib.URLopener.* """
1069 return self.url
1070
1071 def __getattr__(self, name):
1072 """This effectively allows us to wrap at the instance level.
1073 Any attribute not found in _this_ object will be searched for
1074 in self.fo. This includes methods."""
1075
1076 if hasattr(self.fo, name):
1077 return getattr(self.fo, name)
1078 raise AttributeError, name
1079
1080 def _retrieve(self, buf):
1081 try:
1082 if not self._prog_running:
1083 if self.opts.progress_obj:
1084 size = self.size + self._reget_length
1085 self.opts.progress_obj.start(self._prog_reportname,
1086 urllib.unquote(self.url),
1087 self._prog_basename,
1088 size=size,
1089 text=self.opts.text)
1090 self._prog_running = True
1091 self.opts.progress_obj.update(self._amount_read)
1092
1093 self._amount_read += len(buf)
1094 self.fo.write(buf)
1095 return len(buf)
1096 except KeyboardInterrupt:
1097 return -1
1098
1099 def _hdr_retrieve(self, buf):
1100 if self._hdr_ended:
1101 self._hdr_dump = ''
1102 self.size = 0
1103 self._hdr_ended = False
1104
1105 if self._over_max_size(cur=len(self._hdr_dump),
1106 max_size=self.opts.max_header_size):
1107 return -1
1108 try:
1109 self._hdr_dump += buf
1110 # we have to get the size before we do the progress obj start
1111 # but we can't do that w/o making it do 2 connects, which sucks
1112 # so we cheat and stuff it in here in the hdr_retrieve
1113 if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
1114 length = buf.split(':')[1]
1115 self.size = int(length)
1116 elif self.scheme in ['ftp']:
1117 s = None
1118 if buf.startswith('213 '):
1119 s = buf[3:].strip()
1120 elif buf.startswith('150 '):
1121 s = parse150(buf)
1122 if s: