Merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1 into lp:ubuntu/vivid/urlgrabber

Proposed by Jackson Doak
Status: Needs review
Proposed branch: lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
Merge into: lp:ubuntu/vivid/urlgrabber
Diff against target: 7325 lines (+1389/-4846)
26 files modified
.pc/applied-patches (+0/-3)
.pc/grabber_fix.diff/urlgrabber/grabber.py (+0/-1730)
.pc/progress_fix.diff/urlgrabber/progress.py (+0/-755)
.pc/progress_object_callback_fix.diff/urlgrabber/grabber.py (+0/-1802)
ChangeLog (+8/-0)
MANIFEST (+2/-0)
PKG-INFO (+22/-22)
README (+1/-1)
debian/changelog (+7/-0)
debian/patches/grabber_fix.diff (+0/-236)
debian/patches/progress_fix.diff (+0/-11)
debian/patches/progress_object_callback_fix.diff (+0/-21)
debian/patches/series (+0/-3)
scripts/urlgrabber (+14/-6)
scripts/urlgrabber-ext-down (+75/-0)
setup.py (+4/-2)
test/base_test_code.py (+1/-1)
test/munittest.py (+3/-3)
test/test_byterange.py (+1/-13)
test/test_grabber.py (+2/-1)
test/test_mirror.py (+72/-1)
urlgrabber/__init__.py (+5/-4)
urlgrabber/byterange.py (+8/-8)
urlgrabber/grabber.py (+901/-152)
urlgrabber/mirror.py (+54/-11)
urlgrabber/progress.py (+209/-60)
To merge this branch: bzr merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
Reviewer: Daniel Holbach (community)
Status: Needs Fixing
Review via email: mp+244676@code.launchpad.net

Description of the change

New upstream release; upstreams some patches.

Daniel Holbach (dholbach) wrote:

daniel@daydream:~/urlgrabber$ bzr merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
Unapplying quilt patches to prevent spurious conflicts
+N scripts/urlgrabber-ext-down
 M ChangeLog
 M MANIFEST
 M PKG-INFO
 M README
 M debian/changelog
-D debian/patches/grabber_fix.diff
-D debian/patches/progress_fix.diff
-D debian/patches/progress_object_callback_fix.diff
 M debian/patches/series
 M scripts/urlgrabber
 M setup.py
 M test/base_test_code.py
 M test/munittest.py
 M test/test_byterange.py
 M test/test_grabber.py
 M test/test_mirror.py
 M urlgrabber/__init__.py
 M urlgrabber/byterange.py
 M urlgrabber/grabber.py
 M urlgrabber/mirror.py
 M urlgrabber/progress.py
Text conflict in urlgrabber/grabber.py
1 conflicts encountered.
daniel@daydream:~/urlgrabber$

review: Needs Fixing
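
One way to move the proposal forward, sketched here from the output above: repeat the merge in a checkout of lp:ubuntu/vivid/urlgrabber, fix the single text conflict by hand, and commit. Only the branch and file names come from this proposal; the conflict edit and the commit message are illustrative.

$ bzr branch lp:ubuntu/vivid/urlgrabber && cd urlgrabber
$ bzr merge lp:~noskcaj/ubuntu/vivid/urlgrabber/3.10.1
$ # edit urlgrabber/grabber.py and remove the <<<<<<< TREE / ======= / >>>>>>> MERGE-SOURCE markers
$ bzr resolve urlgrabber/grabber.py
$ bzr commit -m "New upstream release, resolving the text conflict in urlgrabber/grabber.py"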

Unmerged revisions

12. By Jackson Doak

* New upstream release.
* Drop all patches, fixed upstream
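
For context, "drop all patches" corresponds to the deletions visible in the file list above: the three debian/patches/*.diff files and their entries in debian/patches/series go away because the fixes are in the new upstream release. A rough sketch of that step (filenames taken from the file list above; the commands themselves are only illustrative):

$ rm debian/patches/grabber_fix.diff \
     debian/patches/progress_fix.diff \
     debian/patches/progress_object_callback_fix.diff
$ > debian/patches/series   # the file stays in place but lists no patches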

Preview Diff

=== removed file '.pc/applied-patches'
--- .pc/applied-patches 2011-08-09 17:45:08 +0000
+++ .pc/applied-patches 1970-01-01 00:00:00 +0000
@@ -1,3 +0,0 @@
1grabber_fix.diff
2progress_fix.diff
3progress_object_callback_fix.diff
40
=== removed directory '.pc/grabber_fix.diff'
=== removed directory '.pc/grabber_fix.diff/urlgrabber'
=== removed file '.pc/grabber_fix.diff/urlgrabber/grabber.py'
--- .pc/grabber_fix.diff/urlgrabber/grabber.py 2010-07-08 17:40:08 +0000
+++ .pc/grabber_fix.diff/urlgrabber/grabber.py 1970-01-01 00:00:00 +0000
@@ -1,1730 +0,0 @@
1# This library is free software; you can redistribute it and/or
2# modify it under the terms of the GNU Lesser General Public
3# License as published by the Free Software Foundation; either
4# version 2.1 of the License, or (at your option) any later version.
5#
6# This library is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9# Lesser General Public License for more details.
10#
11# You should have received a copy of the GNU Lesser General Public
12# License along with this library; if not, write to the
13# Free Software Foundation, Inc.,
14# 59 Temple Place, Suite 330,
15# Boston, MA 02111-1307 USA
16
17# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
20
21"""A high-level cross-protocol url-grabber.
22
23GENERAL ARGUMENTS (kwargs)
24
25 Where possible, the module-level default is indicated, and legal
26 values are provided.
27
28 copy_local = 0 [0|1]
29
30 ignored except for file:// urls, in which case it specifies
31 whether urlgrab should still make a copy of the file, or simply
32 point to the existing copy. The module level default for this
33 option is 0.
34
35 close_connection = 0 [0|1]
36
37 tells URLGrabber to close the connection after a file has been
38 transfered. This is ignored unless the download happens with the
39 http keepalive handler (keepalive=1). Otherwise, the connection
40 is left open for further use. The module level default for this
41 option is 0 (keepalive connections will not be closed).
42
43 keepalive = 1 [0|1]
44
45 specifies whether keepalive should be used for HTTP/1.1 servers
46 that support it. The module level default for this option is 1
47 (keepalive is enabled).
48
49 progress_obj = None
50
51 a class instance that supports the following methods:
52 po.start(filename, url, basename, length, text)
53 # length will be None if unknown
54 po.update(read) # read == bytes read so far
55 po.end()
56
57 text = None
58
59 specifies alternative text to be passed to the progress meter
60 object. If not given, the default progress meter will use the
61 basename of the file.
62
63 throttle = 1.0
64
65 a number - if it's an int, it's the bytes/second throttle limit.
66 If it's a float, it is first multiplied by bandwidth. If throttle
67 == 0, throttling is disabled. If None, the module-level default
68 (which can be set on default_grabber.throttle) is used. See
69 BANDWIDTH THROTTLING for more information.
70
71 timeout = None
72
73 a positive float expressing the number of seconds to wait for socket
74 operations. If the value is None or 0.0, socket operations will block
75 forever. Setting this option causes urlgrabber to call the settimeout
76 method on the Socket object used for the request. See the Python
77 documentation on settimeout for more information.
78 http://www.python.org/doc/current/lib/socket-objects.html
79
80 bandwidth = 0
81
82 the nominal max bandwidth in bytes/second. If throttle is a float
83 and bandwidth == 0, throttling is disabled. If None, the
84 module-level default (which can be set on
85 default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
86 more information.
87
88 range = None
89
90 a tuple of the form (first_byte, last_byte) describing a byte
91 range to retrieve. Either or both of the values may set to
92 None. If first_byte is None, byte offset 0 is assumed. If
93 last_byte is None, the last byte available is assumed. Note that
94 the range specification is python-like in that (0,10) will yeild
95 the first 10 bytes of the file.
96
97 If set to None, no range will be used.
98
99 reget = None [None|'simple'|'check_timestamp']
100
101 whether to attempt to reget a partially-downloaded file. Reget
102 only applies to .urlgrab and (obviously) only if there is a
103 partially downloaded file. Reget has two modes:
104
105 'simple' -- the local file will always be trusted. If there
106 are 100 bytes in the local file, then the download will always
107 begin 100 bytes into the requested file.
108
109 'check_timestamp' -- the timestamp of the server file will be
110 compared to the timestamp of the local file. ONLY if the
111 local file is newer than or the same age as the server file
112 will reget be used. If the server file is newer, or the
113 timestamp is not returned, the entire file will be fetched.
114
115 NOTE: urlgrabber can do very little to verify that the partial
116 file on disk is identical to the beginning of the remote file.
117 You may want to either employ a custom "checkfunc" or simply avoid
118 using reget in situations where corruption is a concern.
119
120 user_agent = 'urlgrabber/VERSION'
121
122 a string, usually of the form 'AGENT/VERSION' that is provided to
123 HTTP servers in the User-agent header. The module level default
124 for this option is "urlgrabber/VERSION".
125
126 http_headers = None
127
128 a tuple of 2-tuples, each containing a header and value. These
129 will be used for http and https requests only. For example, you
130 can do
131 http_headers = (('Pragma', 'no-cache'),)
132
133 ftp_headers = None
134
135 this is just like http_headers, but will be used for ftp requests.
136
137 proxies = None
138
139 a dictionary that maps protocol schemes to proxy hosts. For
140 example, to use a proxy server on host "foo" port 3128 for http
141 and https URLs:
142 proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
143 note that proxy authentication information may be provided using
144 normal URL constructs:
145 proxies={ 'http' : 'http://user:host@foo:3128' }
146 Lastly, if proxies is None, the default environment settings will
147 be used.
148
149 prefix = None
150
151 a url prefix that will be prepended to all requested urls. For
152 example:
153 g = URLGrabber(prefix='http://foo.com/mirror/')
154 g.urlgrab('some/file.txt')
155 ## this will fetch 'http://foo.com/mirror/some/file.txt'
156 This option exists primarily to allow identical behavior to
157 MirrorGroup (and derived) instances. Note: a '/' will be inserted
158 if necessary, so you cannot specify a prefix that ends with a
159 partial file or directory name.
160
161 opener = None
162 No-op when using the curl backend (default)
163
164 cache_openers = True
165 No-op when using the curl backend (default)
166
167 data = None
168
169 Only relevant for the HTTP family (and ignored for other
170 protocols), this allows HTTP POSTs. When the data kwarg is
171 present (and not None), an HTTP request will automatically become
172 a POST rather than GET. This is done by direct passthrough to
173 urllib2. If you use this, you may also want to set the
174 'Content-length' and 'Content-type' headers with the http_headers
175 option. Note that python 2.2 handles the case of these
176 badly and if you do not use the proper case (shown here), your
177 values will be overridden with the defaults.
178
179 urlparser = URLParser()
180
181 The URLParser class handles pre-processing of URLs, including
182 auth-handling for user/pass encoded in http urls, file handing
183 (that is, filenames not sent as a URL), and URL quoting. If you
184 want to override any of this behavior, you can pass in a
185 replacement instance. See also the 'quote' option.
186
187 quote = None
188
189 Whether or not to quote the path portion of a url.
190 quote = 1 -> quote the URLs (they're not quoted yet)
191 quote = 0 -> do not quote them (they're already quoted)
192 quote = None -> guess what to do
193
194 This option only affects proper urls like 'file:///etc/passwd'; it
195 does not affect 'raw' filenames like '/etc/passwd'. The latter
196 will always be quoted as they are converted to URLs. Also, only
197 the path part of a url is quoted. If you need more fine-grained
198 control, you should probably subclass URLParser and pass it in via
199 the 'urlparser' option.
200
201 ssl_ca_cert = None
202
203 this option can be used if M2Crypto is available and will be
204 ignored otherwise. If provided, it will be used to create an SSL
205 context. If both ssl_ca_cert and ssl_context are provided, then
206 ssl_context will be ignored and a new context will be created from
207 ssl_ca_cert.
208
209 ssl_context = None
210
211 No-op when using the curl backend (default)
212
213
214 self.ssl_verify_peer = True
215
216 Check the server's certificate to make sure it is valid with what our CA validates
217
218 self.ssl_verify_host = True
219
220 Check the server's hostname to make sure it matches the certificate DN
221
222 self.ssl_key = None
223
224 Path to the key the client should use to connect/authenticate with
225
226 self.ssl_key_type = 'PEM'
227
228 PEM or DER - format of key
229
230 self.ssl_cert = None
231
232 Path to the ssl certificate the client should use to to authenticate with
233
234 self.ssl_cert_type = 'PEM'
235
236 PEM or DER - format of certificate
237
238 self.ssl_key_pass = None
239
240 password to access the ssl_key
241
242 self.size = None
243
244 size (in bytes) or Maximum size of the thing being downloaded.
245 This is mostly to keep us from exploding with an endless datastream
246
247 self.max_header_size = 2097152
248
249 Maximum size (in bytes) of the headers.
250
251
252RETRY RELATED ARGUMENTS
253
254 retry = None
255
256 the number of times to retry the grab before bailing. If this is
257 zero, it will retry forever. This was intentional... really, it
258 was :). If this value is not supplied or is supplied but is None
259 retrying does not occur.
260
261 retrycodes = [-1,2,4,5,6,7]
262
263 a sequence of errorcodes (values of e.errno) for which it should
264 retry. See the doc on URLGrabError for more details on this. You
265 might consider modifying a copy of the default codes rather than
266 building yours from scratch so that if the list is extended in the
267 future (or one code is split into two) you can still enjoy the
268 benefits of the default list. You can do that with something like
269 this:
270
271 retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
272 if 12 not in retrycodes:
273 retrycodes.append(12)
274
275 checkfunc = None
276
277 a function to do additional checks. This defaults to None, which
278 means no additional checking. The function should simply return
279 on a successful check. It should raise URLGrabError on an
280 unsuccessful check. Raising of any other exception will be
281 considered immediate failure and no retries will occur.
282
283 If it raises URLGrabError, the error code will determine the retry
284 behavior. Negative error numbers are reserved for use by these
285 passed in functions, so you can use many negative numbers for
286 different types of failure. By default, -1 results in a retry,
287 but this can be customized with retrycodes.
288
289 If you simply pass in a function, it will be given exactly one
290 argument: a CallbackObject instance with the .url attribute
291 defined and either .filename (for urlgrab) or .data (for urlread).
292 For urlgrab, .filename is the name of the local file. For
293 urlread, .data is the actual string data. If you need other
294 arguments passed to the callback (program state of some sort), you
295 can do so like this:
296
297 checkfunc=(function, ('arg1', 2), {'kwarg': 3})
298
299 if the downloaded file has filename /tmp/stuff, then this will
300 result in this call (for urlgrab):
301
302 function(obj, 'arg1', 2, kwarg=3)
303 # obj.filename = '/tmp/stuff'
304 # obj.url = 'http://foo.com/stuff'
305
306 NOTE: both the "args" tuple and "kwargs" dict must be present if
307 you use this syntax, but either (or both) can be empty.
308
309 failure_callback = None
310
311 The callback that gets called during retries when an attempt to
312 fetch a file fails. The syntax for specifying the callback is
313 identical to checkfunc, except for the attributes defined in the
314 CallbackObject instance. The attributes for failure_callback are:
315
316 exception = the raised exception
317 url = the url we're trying to fetch
318 tries = the number of tries so far (including this one)
319 retry = the value of the retry option
320
321 The callback is present primarily to inform the calling program of
322 the failure, but if it raises an exception (including the one it's
323 passed) that exception will NOT be caught and will therefore cause
324 future retries to be aborted.
325
326 The callback is called for EVERY failure, including the last one.
327 On the last try, the callback can raise an alternate exception,
328 but it cannot (without severe trickiness) prevent the exception
329 from being raised.
330
331 interrupt_callback = None
332
333 This callback is called if KeyboardInterrupt is received at any
334 point in the transfer. Basically, this callback can have three
335 impacts on the fetch process based on the way it exits:
336
337 1) raise no exception: the current fetch will be aborted, but
338 any further retries will still take place
339
340 2) raise a URLGrabError: if you're using a MirrorGroup, then
341 this will prompt a failover to the next mirror according to
342 the behavior of the MirrorGroup subclass. It is recommended
343 that you raise URLGrabError with code 15, 'user abort'. If
344 you are NOT using a MirrorGroup subclass, then this is the
345 same as (3).
346
347 3) raise some other exception (such as KeyboardInterrupt), which
348 will not be caught at either the grabber or mirror levels.
349 That is, it will be raised up all the way to the caller.
350
351 This callback is very similar to failure_callback. They are
352 passed the same arguments, so you could use the same function for
353 both.
354
355BANDWIDTH THROTTLING
356
357 urlgrabber supports throttling via two values: throttle and
358 bandwidth Between the two, you can either specify and absolute
359 throttle threshold or specify a theshold as a fraction of maximum
360 available bandwidth.
361
362 throttle is a number - if it's an int, it's the bytes/second
363 throttle limit. If it's a float, it is first multiplied by
364 bandwidth. If throttle == 0, throttling is disabled. If None, the
365 module-level default (which can be set with set_throttle) is used.
366
367 bandwidth is the nominal max bandwidth in bytes/second. If throttle
368 is a float and bandwidth == 0, throttling is disabled. If None, the
369 module-level default (which can be set with set_bandwidth) is used.
370
371 THROTTLING EXAMPLES:
372
373 Lets say you have a 100 Mbps connection. This is (about) 10^8 bits
374 per second, or 12,500,000 Bytes per second. You have a number of
375 throttling options:
376
377 *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
378
379 This will limit urlgrab to use half of your available bandwidth.
380
381 *) set_throttle(6250000) # throttle is an int
382
383 This will also limit urlgrab to use half of your available
384 bandwidth, regardless of what bandwidth is set to.
385
386 *) set_throttle(6250000); set_throttle(1.0) # float
387
388 Use half your bandwidth
389
390 *) set_throttle(6250000); set_throttle(2.0) # float
391
392 Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
393
394 *) set_throttle(6250000); set_throttle(0) # throttle = 0
395
396 Disable throttling - this is more efficient than a very large
397 throttle setting.
398
399 *) set_throttle(0); set_throttle(1.0) # throttle is float, bandwidth = 0
400
401 Disable throttling - this is the default when the module is loaded.
402
403 SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
404
405 While this is flexible, it's not extremely obvious to the user. I
406 suggest you implement a float throttle as a percent to make the
407 distinction between absolute and relative throttling very explicit.
408
409 Also, you may want to convert the units to something more convenient
410 than bytes/second, such as kbps or kB/s, etc.
411
412"""
413
414
415
416import os
417import sys
418import urlparse
419import time
420import string
421import urllib
422import urllib2
423import mimetools
424import thread
425import types
426import stat
427import pycurl
428from ftplib import parse150
429from StringIO import StringIO
430from httplib import HTTPException
431import socket
432from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
433
434########################################################################
435# MODULE INITIALIZATION
436########################################################################
437try:
438 exec('from ' + (__name__.split('.'))[0] + ' import __version__')
439except:
440 __version__ = '???'
441
442########################################################################
443# functions for debugging output. These functions are here because they
444# are also part of the module initialization.
445DEBUG = None
446def set_logger(DBOBJ):
447 """Set the DEBUG object. This is called by _init_default_logger when
448 the environment variable URLGRABBER_DEBUG is set, but can also be
449 called by a calling program. Basically, if the calling program uses
450 the logging module and would like to incorporate urlgrabber logging,
451 then it can do so this way. It's probably not necessary as most
452 internal logging is only for debugging purposes.
453
454 The passed-in object should be a logging.Logger instance. It will
455 be pushed into the keepalive and byterange modules if they're
456 being used. The mirror module pulls this object in on import, so
457 you will need to manually push into it. In fact, you may find it
458 tidier to simply push your logging object (or objects) into each
459 of these modules independently.
460 """
461
462 global DEBUG
463 DEBUG = DBOBJ
464
465def _init_default_logger(logspec=None):
466 '''Examines the environment variable URLGRABBER_DEBUG and creates
467 a logging object (logging.logger) based on the contents. It takes
468 the form
469
470 URLGRABBER_DEBUG=level,filename
471
472 where "level" can be either an integer or a log level from the
473 logging module (DEBUG, INFO, etc). If the integer is zero or
474 less, logging will be disabled. Filename is the filename where
475 logs will be sent. If it is "-", then stdout will be used. If
476 the filename is empty or missing, stderr will be used. If the
477 variable cannot be processed or the logging module cannot be
478 imported (python < 2.3) then logging will be disabled. Here are
479 some examples:
480
481 URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
482 URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
483 URLGRABBER_DEBUG=INFO # log info and higher to stderr
484
485 This funtion is called during module initialization. It is not
486 intended to be called from outside. The only reason it is a
487 function at all is to keep the module-level namespace tidy and to
488 collect the code into a nice block.'''
489
490 try:
491 if logspec is None:
492 logspec = os.environ['URLGRABBER_DEBUG']
493 dbinfo = logspec.split(',')
494 import logging
495 level = logging._levelNames.get(dbinfo[0], None)
496 if level is None: level = int(dbinfo[0])
497 if level < 1: raise ValueError()
498
499 formatter = logging.Formatter('%(asctime)s %(message)s')
500 if len(dbinfo) > 1: filename = dbinfo[1]
501 else: filename = ''
502 if filename == '': handler = logging.StreamHandler(sys.stderr)
503 elif filename == '-': handler = logging.StreamHandler(sys.stdout)
504 else: handler = logging.FileHandler(filename)
505 handler.setFormatter(formatter)
506 DBOBJ = logging.getLogger('urlgrabber')
507 DBOBJ.addHandler(handler)
508 DBOBJ.setLevel(level)
509 except (KeyError, ImportError, ValueError):
510 DBOBJ = None
511 set_logger(DBOBJ)
512
513def _log_package_state():
514 if not DEBUG: return
515 DEBUG.info('urlgrabber version = %s' % __version__)
516 DEBUG.info('trans function "_" = %s' % _)
517
518_init_default_logger()
519_log_package_state()
520
521
522# normally this would be from i18n or something like it ...
523def _(st):
524 return st
525
526########################################################################
527# END MODULE INITIALIZATION
528########################################################################
529
530
531
532class URLGrabError(IOError):
533 """
534 URLGrabError error codes:
535
536 URLGrabber error codes (0 -- 255)
537 0 - everything looks good (you should never see this)
538 1 - malformed url
539 2 - local file doesn't exist
540 3 - request for non-file local file (dir, etc)
541 4 - IOError on fetch
542 5 - OSError on fetch
543 6 - no content length header when we expected one
544 7 - HTTPException
545 8 - Exceeded read limit (for urlread)
546 9 - Requested byte range not satisfiable.
547 10 - Byte range requested, but range support unavailable
548 11 - Illegal reget mode
549 12 - Socket timeout
550 13 - malformed proxy url
551 14 - HTTPError (includes .code and .exception attributes)
552 15 - user abort
553 16 - error writing to local file
554
555 MirrorGroup error codes (256 -- 511)
556 256 - No more mirrors left to try
557
558 Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
559 [ this range reserved for application-specific error codes ]
560
561 Retry codes (< 0)
562 -1 - retry the download, unknown reason
563
564 Note: to test which group a code is in, you can simply do integer
565 division by 256: e.errno / 256
566
567 Negative codes are reserved for use by functions passed in to
568 retrygrab with checkfunc. The value -1 is built in as a generic
569 retry code and is already included in the retrycodes list.
570 Therefore, you can create a custom check function that simply
571 returns -1 and the fetch will be re-tried. For more customized
572 retries, you can use other negative number and include them in
573 retry-codes. This is nice for outputting useful messages about
574 what failed.
575
576 You can use these error codes like so:
577 try: urlgrab(url)
578 except URLGrabError, e:
579 if e.errno == 3: ...
580 # or
581 print e.strerror
582 # or simply
583 print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
584 """
585 def __init__(self, *args):
586 IOError.__init__(self, *args)
587 self.url = "No url specified"
588
589class CallbackObject:
590 """Container for returned callback data.
591
592 This is currently a dummy class into which urlgrabber can stuff
593 information for passing to callbacks. This way, the prototype for
594 all callbacks is the same, regardless of the data that will be
595 passed back. Any function that accepts a callback function as an
596 argument SHOULD document what it will define in this object.
597
598 It is possible that this class will have some greater
599 functionality in the future.
600 """
601 def __init__(self, **kwargs):
602 self.__dict__.update(kwargs)
603
604def urlgrab(url, filename=None, **kwargs):
605 """grab the file at <url> and make a local copy at <filename>
606 If filename is none, the basename of the url is used.
607 urlgrab returns the filename of the local file, which may be different
608 from the passed-in filename if the copy_local kwarg == 0.
609
610 See module documentation for a description of possible kwargs.
611 """
612 return default_grabber.urlgrab(url, filename, **kwargs)
613
614def urlopen(url, **kwargs):
615 """open the url and return a file object
616 If a progress object or throttle specifications exist, then
617 a special file object will be returned that supports them.
618 The file object can be treated like any other file object.
619
620 See module documentation for a description of possible kwargs.
621 """
622 return default_grabber.urlopen(url, **kwargs)
623
624def urlread(url, limit=None, **kwargs):
625 """read the url into a string, up to 'limit' bytes
626 If the limit is exceeded, an exception will be thrown. Note that urlread
627 is NOT intended to be used as a way of saying "I want the first N bytes"
628 but rather 'read the whole file into memory, but don't use too much'
629
630 See module documentation for a description of possible kwargs.
631 """
632 return default_grabber.urlread(url, limit, **kwargs)
633
634
635class URLParser:
636 """Process the URLs before passing them to urllib2.
637
638 This class does several things:
639
640 * add any prefix
641 * translate a "raw" file to a proper file: url
642 * handle any http or https auth that's encoded within the url
643 * quote the url
644
645 Only the "parse" method is called directly, and it calls sub-methods.
646
647 An instance of this class is held in the options object, which
648 means that it's easy to change the behavior by sub-classing and
649 passing the replacement in. It need only have a method like:
650
651 url, parts = urlparser.parse(url, opts)
652 """
653
654 def parse(self, url, opts):
655 """parse the url and return the (modified) url and its parts
656
657 Note: a raw file WILL be quoted when it's converted to a URL.
658 However, other urls (ones which come with a proper scheme) may
659 or may not be quoted according to opts.quote
660
661 opts.quote = 1 --> quote it
662 opts.quote = 0 --> do not quote it
663 opts.quote = None --> guess
664 """
665 quote = opts.quote
666
667 if opts.prefix:
668 url = self.add_prefix(url, opts.prefix)
669
670 parts = urlparse.urlparse(url)
671 (scheme, host, path, parm, query, frag) = parts
672
673 if not scheme or (len(scheme) == 1 and scheme in string.letters):
674 # if a scheme isn't specified, we guess that it's "file:"
675 if url[0] not in '/\\': url = os.path.abspath(url)
676 url = 'file:' + urllib.pathname2url(url)
677 parts = urlparse.urlparse(url)
678 quote = 0 # pathname2url quotes, so we won't do it again
679
680 if scheme in ['http', 'https']:
681 parts = self.process_http(parts, url)
682
683 if quote is None:
684 quote = self.guess_should_quote(parts)
685 if quote:
686 parts = self.quote(parts)
687
688 url = urlparse.urlunparse(parts)
689 return url, parts
690
691 def add_prefix(self, url, prefix):
692 if prefix[-1] == '/' or url[0] == '/':
693 url = prefix + url
694 else:
695 url = prefix + '/' + url
696 return url
697
698 def process_http(self, parts, url):
699 (scheme, host, path, parm, query, frag) = parts
700 # TODO: auth-parsing here, maybe? pycurl doesn't really need it
701 return (scheme, host, path, parm, query, frag)
702
703 def quote(self, parts):
704 """quote the URL
705
706 This method quotes ONLY the path part. If you need to quote
707 other parts, you should override this and pass in your derived
708 class. The other alternative is to quote other parts before
709 passing into urlgrabber.
710 """
711 (scheme, host, path, parm, query, frag) = parts
712 path = urllib.quote(path)
713 return (scheme, host, path, parm, query, frag)
714
715 hexvals = '0123456789ABCDEF'
716 def guess_should_quote(self, parts):
717 """
718 Guess whether we should quote a path. This amounts to
719 guessing whether it's already quoted.
720
721 find ' ' -> 1
722 find '%' -> 1
723 find '%XX' -> 0
724 else -> 1
725 """
726 (scheme, host, path, parm, query, frag) = parts
727 if ' ' in path:
728 return 1
729 ind = string.find(path, '%')
730 if ind > -1:
731 while ind > -1:
732 if len(path) < ind+3:
733 return 1
734 code = path[ind+1:ind+3].upper()
735 if code[0] not in self.hexvals or \
736 code[1] not in self.hexvals:
737 return 1
738 ind = string.find(path, '%', ind+1)
739 return 0
740 return 1
741
742class URLGrabberOptions:
743 """Class to ease kwargs handling."""
744
745 def __init__(self, delegate=None, **kwargs):
746 """Initialize URLGrabberOptions object.
747 Set default values for all options and then update options specified
748 in kwargs.
749 """
750 self.delegate = delegate
751 if delegate is None:
752 self._set_defaults()
753 self._set_attributes(**kwargs)
754
755 def __getattr__(self, name):
756 if self.delegate and hasattr(self.delegate, name):
757 return getattr(self.delegate, name)
758 raise AttributeError, name
759
760 def raw_throttle(self):
761 """Calculate raw throttle value from throttle and bandwidth
762 values.
763 """
764 if self.throttle <= 0:
765 return 0
766 elif type(self.throttle) == type(0):
767 return float(self.throttle)
768 else: # throttle is a float
769 return self.bandwidth * self.throttle
770
771 def derive(self, **kwargs):
772 """Create a derived URLGrabberOptions instance.
773 This method creates a new instance and overrides the
774 options specified in kwargs.
775 """
776 return URLGrabberOptions(delegate=self, **kwargs)
777
778 def _set_attributes(self, **kwargs):
779 """Update object attributes with those provided in kwargs."""
780 self.__dict__.update(kwargs)
781 if kwargs.has_key('range'):
782 # normalize the supplied range value
783 self.range = range_tuple_normalize(self.range)
784 if not self.reget in [None, 'simple', 'check_timestamp']:
785 raise URLGrabError(11, _('Illegal reget mode: %s') \
786 % (self.reget, ))
787
788 def _set_defaults(self):
789 """Set all options to their default values.
790 When adding new options, make sure a default is
791 provided here.
792 """
793 self.progress_obj = None
794 self.throttle = 1.0
795 self.bandwidth = 0
796 self.retry = None
797 self.retrycodes = [-1,2,4,5,6,7]
798 self.checkfunc = None
799 self.copy_local = 0
800 self.close_connection = 0
801 self.range = None
802 self.user_agent = 'urlgrabber/%s' % __version__
803 self.keepalive = 1
804 self.proxies = None
805 self.reget = None
806 self.failure_callback = None
807 self.interrupt_callback = None
808 self.prefix = None
809 self.opener = None
810 self.cache_openers = True
811 self.timeout = None
812 self.text = None
813 self.http_headers = None
814 self.ftp_headers = None
815 self.data = None
816 self.urlparser = URLParser()
817 self.quote = None
818 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
819 self.ssl_context = None # no-op in pycurl
820 self.ssl_verify_peer = True # check peer's cert for authenticityb
821 self.ssl_verify_host = True # make sure who they are and who the cert is for matches
822 self.ssl_key = None # client key
823 self.ssl_key_type = 'PEM' #(or DER)
824 self.ssl_cert = None # client cert
825 self.ssl_cert_type = 'PEM' # (or DER)
826 self.ssl_key_pass = None # password to access the key
827 self.size = None # if we know how big the thing we're getting is going
828 # to be. this is ultimately a MAXIMUM size for the file
829 self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
830
831 def __repr__(self):
832 return self.format()
833
834 def format(self, indent=' '):
835 keys = self.__dict__.keys()
836 if self.delegate is not None:
837 keys.remove('delegate')
838 keys.sort()
839 s = '{\n'
840 for k in keys:
841 s = s + indent + '%-15s: %s,\n' % \
842 (repr(k), repr(self.__dict__[k]))
843 if self.delegate:
844 df = self.delegate.format(indent + ' ')
845 s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
846 s = s + indent + '}'
847 return s
848
849class URLGrabber:
850 """Provides easy opening of URLs with a variety of options.
851
852 All options are specified as kwargs. Options may be specified when
853 the class is created and may be overridden on a per request basis.
854
855 New objects inherit default values from default_grabber.
856 """
857
858 def __init__(self, **kwargs):
859 self.opts = URLGrabberOptions(**kwargs)
860
861 def _retry(self, opts, func, *args):
862 tries = 0
863 while 1:
864 # there are only two ways out of this loop. The second has
865 # several "sub-ways"
866 # 1) via the return in the "try" block
867 # 2) by some exception being raised
868 # a) an excepton is raised that we don't "except"
869 # b) a callback raises ANY exception
870 # c) we're not retry-ing or have run out of retries
871 # d) the URLGrabError code is not in retrycodes
872 # beware of infinite loops :)
873 tries = tries + 1
874 exception = None
875 retrycode = None
876 callback = None
877 if DEBUG: DEBUG.info('attempt %i/%s: %s',
878 tries, opts.retry, args[0])
879 try:
880 r = apply(func, (opts,) + args, {})
881 if DEBUG: DEBUG.info('success')
882 return r
883 except URLGrabError, e:
884 exception = e
885 callback = opts.failure_callback
886 retrycode = e.errno
887 except KeyboardInterrupt, e:
888 exception = e
889 callback = opts.interrupt_callback
890
891 if DEBUG: DEBUG.info('exception: %s', exception)
892 if callback:
893 if DEBUG: DEBUG.info('calling callback: %s', callback)
894 cb_func, cb_args, cb_kwargs = self._make_callback(callback)
895 obj = CallbackObject(exception=exception, url=args[0],
896 tries=tries, retry=opts.retry)
897 cb_func(obj, *cb_args, **cb_kwargs)
898
899 if (opts.retry is None) or (tries == opts.retry):
900 if DEBUG: DEBUG.info('retries exceeded, re-raising')
901 raise
902
903 if (retrycode is not None) and (retrycode not in opts.retrycodes):
904 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
905 retrycode, opts.retrycodes)
906 raise
907
908 def urlopen(self, url, **kwargs):
909 """open the url and return a file object
910 If a progress object or throttle value specified when this
911 object was created, then a special file object will be
912 returned that supports them. The file object can be treated
913 like any other file object.
914 """
915 opts = self.opts.derive(**kwargs)
916 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
917 (url,parts) = opts.urlparser.parse(url, opts)
918 def retryfunc(opts, url):
919 return PyCurlFileObject(url, filename=None, opts=opts)
920 return self._retry(opts, retryfunc, url)
921
922 def urlgrab(self, url, filename=None, **kwargs):
923 """grab the file at <url> and make a local copy at <filename>
924 If filename is none, the basename of the url is used.
925 urlgrab returns the filename of the local file, which may be
926 different from the passed-in filename if copy_local == 0.
927 """
928 opts = self.opts.derive(**kwargs)
929 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
930 (url,parts) = opts.urlparser.parse(url, opts)
931 (scheme, host, path, parm, query, frag) = parts
932 if filename is None:
933 filename = os.path.basename( urllib.unquote(path) )
934 if scheme == 'file' and not opts.copy_local:
935 # just return the name of the local file - don't make a
936 # copy currently
937 path = urllib.url2pathname(path)
938 if host:
939 path = os.path.normpath('//' + host + path)
940 if not os.path.exists(path):
941 err = URLGrabError(2,
942 _('Local file does not exist: %s') % (path, ))
943 err.url = url
944 raise err
945 elif not os.path.isfile(path):
946 err = URLGrabError(3,
947 _('Not a normal file: %s') % (path, ))
948 err.url = url
949 raise err
950
951 elif not opts.range:
952 if not opts.checkfunc is None:
953 cb_func, cb_args, cb_kwargs = \
954 self._make_callback(opts.checkfunc)
955 obj = CallbackObject()
956 obj.filename = path
957 obj.url = url
958 apply(cb_func, (obj, )+cb_args, cb_kwargs)
959 return path
960
961 def retryfunc(opts, url, filename):
962 fo = PyCurlFileObject(url, filename, opts)
963 try:
964 fo._do_grab()
965 if not opts.checkfunc is None:
966 cb_func, cb_args, cb_kwargs = \
967 self._make_callback(opts.checkfunc)
968 obj = CallbackObject()
969 obj.filename = filename
970 obj.url = url
971 apply(cb_func, (obj, )+cb_args, cb_kwargs)
972 finally:
973 fo.close()
974 return filename
975
976 return self._retry(opts, retryfunc, url, filename)
977
978 def urlread(self, url, limit=None, **kwargs):
979 """read the url into a string, up to 'limit' bytes
980 If the limit is exceeded, an exception will be thrown. Note
981 that urlread is NOT intended to be used as a way of saying
982 "I want the first N bytes" but rather 'read the whole file
983 into memory, but don't use too much'
984 """
985 opts = self.opts.derive(**kwargs)
986 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
987 (url,parts) = opts.urlparser.parse(url, opts)
988 if limit is not None:
989 limit = limit + 1
990
991 def retryfunc(opts, url, limit):
992 fo = PyCurlFileObject(url, filename=None, opts=opts)
993 s = ''
994 try:
995 # this is an unfortunate thing. Some file-like objects
996 # have a default "limit" of None, while the built-in (real)
997 # file objects have -1. They each break the other, so for
998 # now, we just force the default if necessary.
999 if limit is None: s = fo.read()
1000 else: s = fo.read(limit)
1001
1002 if not opts.checkfunc is None:
1003 cb_func, cb_args, cb_kwargs = \
1004 self._make_callback(opts.checkfunc)
1005 obj = CallbackObject()
1006 obj.data = s
1007 obj.url = url
1008 apply(cb_func, (obj, )+cb_args, cb_kwargs)
1009 finally:
1010 fo.close()
1011 return s
1012
1013 s = self._retry(opts, retryfunc, url, limit)
1014 if limit and len(s) > limit:
1015 err = URLGrabError(8,
1016 _('Exceeded limit (%i): %s') % (limit, url))
1017 err.url = url
1018 raise err
1019
1020 return s
1021
1022 def _make_callback(self, callback_obj):
1023 if callable(callback_obj):
1024 return callback_obj, (), {}
1025 else:
1026 return callback_obj
1027
1028# create the default URLGrabber used by urlXXX functions.
1029# NOTE: actual defaults are set in URLGrabberOptions
1030default_grabber = URLGrabber()
1031
1032
1033class PyCurlFileObject():
1034 def __init__(self, url, filename, opts):
1035 self.fo = None
1036 self._hdr_dump = ''
1037 self._parsed_hdr = None
1038 self.url = url
1039 self.scheme = urlparse.urlsplit(self.url)[0]
1040 self.filename = filename
1041 self.append = False
1042 self.reget_time = None
1043 self.opts = opts
1044 if self.opts.reget == 'check_timestamp':
1045 raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
1046 self._complete = False
1047 self._rbuf = ''
1048 self._rbufsize = 1024*8
1049 self._ttime = time.time()
1050 self._tsize = 0
1051 self._amount_read = 0
1052 self._reget_length = 0
1053 self._prog_running = False
1054 self._error = (None, None)
1055 self.size = None
1056 self._do_open()
1057
1058
1059 def __getattr__(self, name):
1060 """This effectively allows us to wrap at the instance level.
1061 Any attribute not found in _this_ object will be searched for
1062 in self.fo. This includes methods."""
1063
1064 if hasattr(self.fo, name):
1065 return getattr(self.fo, name)
1066 raise AttributeError, name
1067
1068 def _retrieve(self, buf):
1069 try:
1070 if not self._prog_running:
1071 if self.opts.progress_obj:
1072 size = self.size + self._reget_length
1073 self.opts.progress_obj.start(self._prog_reportname,
1074 urllib.unquote(self.url),
1075 self._prog_basename,
1076 size=size,
1077 text=self.opts.text)
1078 self._prog_running = True
1079 self.opts.progress_obj.update(self._amount_read)
1080
1081 self._amount_read += len(buf)
1082 self.fo.write(buf)
1083 return len(buf)
1084 except KeyboardInterrupt:
1085 return -1
1086
1087 def _hdr_retrieve(self, buf):
1088 if self._over_max_size(cur=len(self._hdr_dump),
1089 max_size=self.opts.max_header_size):
1090 return -1
1091 try:
1092 self._hdr_dump += buf
1093 # we have to get the size before we do the progress obj start
1094 # but we can't do that w/o making it do 2 connects, which sucks
1095 # so we cheat and stuff it in here in the hdr_retrieve
1096 if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
1097 length = buf.split(':')[1]
1098 self.size = int(length)
1099 elif self.scheme in ['ftp']:
1100 s = None
1101 if buf.startswith('213 '):
1102 s = buf[3:].strip()
1103 elif buf.startswith('150 '):
1104 s = parse150(buf)
1105 if s:
1106 self.size = int(s)
1107
1108 return len(buf)
1109 except KeyboardInterrupt:
1110 return pycurl.READFUNC_ABORT
1111
1112 def _return_hdr_obj(self):
1113 if self._parsed_hdr:
1114 return self._parsed_hdr
1115 statusend = self._hdr_dump.find('\n')
1116 hdrfp = StringIO()
1117 hdrfp.write(self._hdr_dump[statusend:])
1118 self._parsed_hdr = mimetools.Message(hdrfp)
1119 return self._parsed_hdr
1120
1121 hdr = property(_return_hdr_obj)
1122 http_code = property(fget=
1123 lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
1124
1125 def _set_opts(self, opts={}):
1126 # XXX
1127 if not opts:
1128 opts = self.opts
1129
1130
1131 # defaults we're always going to set
1132 self.curl_obj.setopt(pycurl.NOPROGRESS, False)
1133 self.curl_obj.setopt(pycurl.NOSIGNAL, True)
1134 self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
1135 self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
1136 self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
1137 self.curl_obj.setopt(pycurl.FAILONERROR, True)
1138 self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
1139
1140 if DEBUG:
1141 self.curl_obj.setopt(pycurl.VERBOSE, True)
1142 if opts.user_agent:
1143 self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
1144
1145 # maybe to be options later
1146 self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
1147 self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
1148
1149 # timeouts
1150 timeout = 300
1151 if opts.timeout:
1152 timeout = int(opts.timeout)
1153 self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
1154
1155 # ssl options
1156 if self.scheme == 'https':
1157 if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
1158 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
1159 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
1160 self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
1161 self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
1162 if opts.ssl_key:
1163 self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
1164 if opts.ssl_key_type:
1165 self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
1166 if opts.ssl_cert:
1167 self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
1168 if opts.ssl_cert_type:
1169 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
1170 if opts.ssl_key_pass:
1171 self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
1172
1173 #headers:
1174 if opts.http_headers and self.scheme in ('http', 'https'):
1175 headers = []
1176 for (tag, content) in opts.http_headers:
1177 headers.append('%s:%s' % (tag, content))
1178 self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
1179
1180 # ranges:
1181 if opts.range or opts.reget:
1182 range_str = self._build_range()
1183 if range_str:
1184 self.curl_obj.setopt(pycurl.RANGE, range_str)
1185
1186 # throttle/bandwidth
1187 if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
1188 self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
1189
1190 # proxy settings
1191 if opts.proxies:
1192 for (scheme, proxy) in opts.proxies.items():
1193 if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
1194 if scheme not in ('ftp'):
1195 continue
1196 else:
1197 if proxy == '_none_': proxy = ""
1198 self.curl_obj.setopt(pycurl.PROXY, proxy)
1199 elif self.scheme in ('http', 'https'):
1200 if scheme not in ('http', 'https'):
1201 continue
1202 else:
1203 if proxy == '_none_': proxy = ""
1204 self.curl_obj.setopt(pycurl.PROXY, proxy)
1205
1206 # FIXME username/password/auth settings
1207
1208 #posts - simple - expects the fields as they are
1209 if opts.data:
1210 self.curl_obj.setopt(pycurl.POST, True)
1211 self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
1212
1213 # our url
1214 self.curl_obj.setopt(pycurl.URL, self.url)
1215
1216
1217 def _do_perform(self):
1218 if self._complete:
1219 return
1220
1221 try:
1222 self.curl_obj.perform()
1223 except pycurl.error, e:
1224 # XXX - break some of these out a bit more clearly
1225 # to other URLGrabErrors from
1226 # http://curl.haxx.se/libcurl/c/libcurl-errors.html
1227 # this covers e.args[0] == 22 pretty well - which will be common
1228
1229 code = self.http_code
1230 errcode = e.args[0]
1231 if self._error[0]:
1232 errcode = self._error[0]
1233
1234 if errcode == 23 and code >= 200 and code < 299:
1235 err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
1236 err.url = self.url
1237
1238 # this is probably wrong but ultimately this is what happens
1239 # we have a legit http code and a pycurl 'writer failed' code
1240 # which almost always means something aborted it from outside
1241 # since we cannot know what it is -I'm banking on it being
1242 # a ctrl-c. XXXX - if there's a way of going back two raises to
1243 # figure out what aborted the pycurl process FIXME
1244 raise KeyboardInterrupt
1245
1246 elif errcode == 28:
1247 err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1248 err.url = self.url
1249 raise err
1250 elif errcode == 35:
1251 msg = _("problem making ssl connection")
1252 err = URLGrabError(14, msg)
1253 err.url = self.url
1254 raise err
1255 elif errcode == 37:
1256 msg = _("Could not open/read %s") % (self.url)
1257 err = URLGrabError(14, msg)
1258 err.url = self.url
1259 raise err
1260
1261 elif errcode == 42:
1262 err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
1263 err.url = self.url
1264 # this is probably wrong but ultimately this is what happens
1265 # we have a legit http code and a pycurl 'writer failed' code
1266 # which almost always means something aborted it from outside
1267 # since we cannot know what it is -I'm banking on it being
1268 # a ctrl-c. XXXX - if there's a way of going back two raises to
1269 # figure out what aborted the pycurl process FIXME
1270 raise KeyboardInterrupt
1271
1272 elif errcode == 58:
1273 msg = _("problem with the local client certificate")
1274 err = URLGrabError(14, msg)
1275 err.url = self.url
1276 raise err
1277
1278 elif errcode == 60:
1279 msg = _("client cert cannot be verified or client cert incorrect")
1280 err = URLGrabError(14, msg)
1281 err.url = self.url
1282 raise err
1283
1284 elif errcode == 63:
1285 if self._error[1]:
1286 msg = self._error[1]
1287 else:
1288 msg = _("Max download size exceeded on %s") % (self.url)
1289 err = URLGrabError(14, msg)
1290 err.url = self.url
1291 raise err
1292
1293 elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
1294 msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
1295 else:
1296 msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
1297 code = errcode
1298 err = URLGrabError(14, msg)
1299 err.code = code
1300 err.exception = e
1301 raise err
1302
1303 def _do_open(self):
1304 self.curl_obj = _curl_cache
1305 self.curl_obj.reset() # reset all old settings away, just in case
1306 # setup any ranges
1307 self._set_opts()
1308 self._do_grab()
1309 return self.fo
1310
1311 def _add_headers(self):
1312 pass
1313
1314 def _build_range(self):
1315 reget_length = 0
1316 rt = None
1317 if self.opts.reget and type(self.filename) in types.StringTypes:
1318 # we have reget turned on and we're dumping to a file
1319 try:
1320 s = os.stat(self.filename)
1321 except OSError:
1322 pass
1323 else:
1324 self.reget_time = s[stat.ST_MTIME]
1325 reget_length = s[stat.ST_SIZE]
1326
1327 # Set initial length when regetting
1328 self._amount_read = reget_length
1329 self._reget_length = reget_length # set where we started from, too
1330
1331 rt = reget_length, ''
1332 self.append = 1
1333
1334 if self.opts.range:
1335 rt = self.opts.range
1336 if rt[0]: rt = (rt[0] + reget_length, rt[1])
1337
1338 if rt:
1339 header = range_tuple_to_header(rt)
1340 if header:
1341 return header.split('=')[1]
1342
1343
1344
1345 def _make_request(self, req, opener):
1346 #XXXX
1347 # This doesn't do anything really, but we could use this
1348 # instead of do_open() to catch a lot of crap errors as
1349 # mstenner did before here
1350 return (self.fo, self.hdr)
1351
1352 try:
1353 if self.opts.timeout:
1354 old_to = socket.getdefaulttimeout()
1355 socket.setdefaulttimeout(self.opts.timeout)
1356 try:
1357 fo = opener.open(req)
1358 finally:
1359 socket.setdefaulttimeout(old_to)
1360 else:
1361 fo = opener.open(req)
1362 hdr = fo.info()
1363 except ValueError, e:
1364 err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
1365 err.url = self.url
1366 raise err
1367
1368 except RangeError, e:
1369 err = URLGrabError(9, _('%s on %s') % (e, self.url))
1370 err.url = self.url
1371 raise err
1372 except urllib2.HTTPError, e:
1373 new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
1374 new_e.code = e.code
1375 new_e.exception = e
1376 new_e.url = self.url
1377 raise new_e
1378 except IOError, e:
1379 if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
1380 err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1381 err.url = self.url
1382 raise err
1383 else:
1384 err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
1385 err.url = self.url
1386 raise err
1387
1388 except OSError, e:
1389 err = URLGrabError(5, _('%s on %s') % (e, self.url))
1390 err.url = self.url
1391 raise err
1392
1393 except HTTPException, e:
1394 err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
1395 (e.__class__.__name__, self.url, e))
1396 err.url = self.url
1397 raise err
1398
1399 else:
1400 return (fo, hdr)
1401
1402 def _do_grab(self):
1403 """dump the file to a filename or StringIO buffer"""
1404
1405 if self._complete:
1406 return
1407 _was_filename = False
1408 if type(self.filename) in types.StringTypes and self.filename:
1409 _was_filename = True
1410 self._prog_reportname = str(self.filename)
1411 self._prog_basename = os.path.basename(self.filename)
1412
1413 if self.append: mode = 'ab'
1414 else: mode = 'wb'
1415
1416 if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
1417 (self.filename, mode))
1418 try:
1419 self.fo = open(self.filename, mode)
1420 except IOError, e:
1421 err = URLGrabError(16, _(\
1422 'error opening local file from %s, IOError: %s') % (self.url, e))
1423 err.url = self.url
1424 raise err
1425
1426 else:
1427 self._prog_reportname = 'MEMORY'
1428 self._prog_basename = 'MEMORY'
1429
1430
1431 self.fo = StringIO()
1432 # if this is to be a tempfile instead....
1433 # it just makes crap in the tempdir
1434 #fh, self._temp_name = mkstemp()
1435 #self.fo = open(self._temp_name, 'wb')
1436
1437
1438 self._do_perform()
1439
1440
1441
1442 if _was_filename:
1443 # close it up
1444 self.fo.flush()
1445 self.fo.close()
1446 # set the time
1447 mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
1448 if mod_time != -1:
1449 os.utime(self.filename, (mod_time, mod_time))
1450 # re open it
1451 self.fo = open(self.filename, 'r')
1452 else:
1453 #self.fo = open(self._temp_name, 'r')
1454 self.fo.seek(0)
1455
1456 self._complete = True
1457
1458 def _fill_buffer(self, amt=None):
1459 """fill the buffer to contain at least 'amt' bytes by reading
1460 from the underlying file object. If amt is None, then it will
1461 read until it gets nothing more. It updates the progress meter
1462 and throttles after every self._rbufsize bytes."""
1463 # the _rbuf test is only in this first 'if' for speed. It's not
1464 # logically necessary
1465 if self._rbuf and not amt is None:
1466 L = len(self._rbuf)
1467 if amt > L:
1468 amt = amt - L
1469 else:
1470 return
1471
1472 # if we've made it here, then we don't have enough in the buffer
1473 # and we need to read more.
1474
1475 if not self._complete: self._do_grab() #XXX cheater - change on ranges
1476
1477 buf = [self._rbuf]
1478 bufsize = len(self._rbuf)
1479 while amt is None or amt:
1480 # first, delay if necessary for throttling reasons
1481 if self.opts.raw_throttle():
1482 diff = self._tsize/self.opts.raw_throttle() - \
1483 (time.time() - self._ttime)
1484 if diff > 0: time.sleep(diff)
1485 self._ttime = time.time()
1486
1487 # now read some data, up to self._rbufsize
1488 if amt is None: readamount = self._rbufsize
1489 else: readamount = min(amt, self._rbufsize)
1490 try:
1491 new = self.fo.read(readamount)
1492 except socket.error, e:
1493 err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
1494 err.url = self.url
1495 raise err
1496
1497 except socket.timeout, e:
1498 raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1499 err.url = self.url
1500 raise err
1501
1502 except IOError, e:
1503 raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e))
1504 err.url = self.url
1505 raise err
1506
1507 newsize = len(new)
1508 if not newsize: break # no more to read
1509
1510 if amt: amt = amt - newsize
1511 buf.append(new)
1512 bufsize = bufsize + newsize
1513 self._tsize = newsize
1514 self._amount_read = self._amount_read + newsize
1515 #if self.opts.progress_obj:
1516 # self.opts.progress_obj.update(self._amount_read)
1517
1518 self._rbuf = string.join(buf, '')
1519 return
1520
1521 def _progress_update(self, download_total, downloaded, upload_total, uploaded):
1522 if self._over_max_size(cur=self._amount_read-self._reget_length):
1523 return -1
1524
1525 try:
1526 if self._prog_running:
1527 downloaded += self._reget_length
1528 self.opts.progress_obj.update(downloaded)
1529 except KeyboardInterrupt:
1530 return -1
1531
1532 def _over_max_size(self, cur, max_size=None):
1533
1534 if not max_size:
1535 max_size = self.size
1536 if self.opts.size: # if we set an opts size use that, no matter what
1537 max_size = self.opts.size
1538 if not max_size: return False # if we have None for all of the Max then this is dumb
1539 if cur > max_size + max_size*.10:
1540
1541 msg = _("Downloaded more than max size for %s: %s > %s") \
1542 % (self.url, cur, max_size)
1543 self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
1544 return True
1545 return False
1546
1547 def _to_utf8(self, obj, errors='replace'):
1548 '''convert 'unicode' to an encoded utf-8 byte string '''
1549 # stolen from yum.i18n
1550 if isinstance(obj, unicode):
1551 obj = obj.encode('utf-8', errors)
1552 return obj
1553
1554 def read(self, amt=None):
1555 self._fill_buffer(amt)
1556 if amt is None:
1557 s, self._rbuf = self._rbuf, ''
1558 else:
1559 s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
1560 return s
1561
1562 def readline(self, limit=-1):
1563 if not self._complete: self._do_grab()
1564 return self.fo.readline()
1565
1566 i = string.find(self._rbuf, '\n')
1567 while i < 0 and not (0 < limit <= len(self._rbuf)):
1568 L = len(self._rbuf)
1569 self._fill_buffer(L + self._rbufsize)
1570 if not len(self._rbuf) > L: break
1571 i = string.find(self._rbuf, '\n', L)
1572
1573 if i < 0: i = len(self._rbuf)
1574 else: i = i+1
1575 if 0 <= limit < len(self._rbuf): i = limit
1576
1577 s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
1578 return s
1579
1580 def close(self):
1581 if self._prog_running:
1582 self.opts.progress_obj.end(self._amount_read)
1583 self.fo.close()
1584
1585
1586_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
1587
1588
1589#####################################################################
1590# DEPRECATED FUNCTIONS
1591def set_throttle(new_throttle):
1592 """Deprecated. Use: default_grabber.throttle = new_throttle"""
1593 default_grabber.throttle = new_throttle
1594
1595def set_bandwidth(new_bandwidth):
1596 """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
1597 default_grabber.bandwidth = new_bandwidth
1598
1599def set_progress_obj(new_progress_obj):
1600 """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
1601 default_grabber.progress_obj = new_progress_obj
1602
1603def set_user_agent(new_user_agent):
1604 """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
1605 default_grabber.user_agent = new_user_agent
1606
1607def retrygrab(url, filename=None, copy_local=0, close_connection=0,
1608 progress_obj=None, throttle=None, bandwidth=None,
1609 numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
1610 """Deprecated. Use: urlgrab() with the retry arg instead"""
1611 kwargs = {'copy_local' : copy_local,
1612 'close_connection' : close_connection,
1613 'progress_obj' : progress_obj,
1614 'throttle' : throttle,
1615 'bandwidth' : bandwidth,
1616 'retry' : numtries,
1617 'retrycodes' : retrycodes,
1618 'checkfunc' : checkfunc
1619 }
1620 return urlgrab(url, filename, **kwargs)
1621
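# Illustrative sketch, not part of the original module: the non-deprecated
# spelling of the retrygrab() call above is a plain urlgrab() with the
# retry-related keyword arguments.  The URL, the local filename and the
# helper name _example_retrying_grab are made up.
def _example_retrying_grab():
    import os

    def checkfunc(obj):
        # obj.filename is the freshly downloaded local file; raising
        # URLGrabError with a negative errno asks urlgrab() to retry.
        if os.path.getsize(obj.filename) == 0:
            raise URLGrabError(-1, 'empty file, forcing retry')

    return urlgrab('http://example.com/stuff', '/tmp/stuff',
                   retry=3, retrycodes=[-1, 2, 4, 5, 6, 7],
                   checkfunc=checkfunc)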
1622
1623#####################################################################
1624# TESTING
1625def _main_test():
1626 try: url, filename = sys.argv[1:3]
1627 except ValueError:
1628 print 'usage:', sys.argv[0], \
1629 '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1630 sys.exit()
1631
1632 kwargs = {}
1633 for a in sys.argv[3:]:
1634 k, v = string.split(a, '=', 1)
1635 kwargs[k] = int(v)
1636
1637 set_throttle(1.0)
1638 set_bandwidth(32 * 1024)
1639 print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
1640 default_grabber.bandwidth)
1641
1642 try: from progress import text_progress_meter
1643 except ImportError, e: pass
1644 else: kwargs['progress_obj'] = text_progress_meter()
1645
1646 try: name = apply(urlgrab, (url, filename), kwargs)
1647 except URLGrabError, e: print e
1648 else: print 'LOCAL FILE:', name
1649
1650
1651def _retry_test():
1652 try: url, filename = sys.argv[1:3]
1653 except ValueError:
1654 print 'usage:', sys.argv[0], \
1655 '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1656 sys.exit()
1657
1658 kwargs = {}
1659 for a in sys.argv[3:]:
1660 k, v = string.split(a, '=', 1)
1661 kwargs[k] = int(v)
1662
1663 try: from progress import text_progress_meter
1664 except ImportError, e: pass
1665 else: kwargs['progress_obj'] = text_progress_meter()
1666
1667 def cfunc(filename, hello, there='foo'):
1668 print hello, there
1669 import random
1670 rnum = random.random()
1671 if rnum < .5:
1672 print 'forcing retry'
1673 raise URLGrabError(-1, 'forcing retry')
1674 if rnum < .75:
1675 print 'forcing failure'
1676 raise URLGrabError(-2, 'forcing immediate failure')
1677 print 'success'
1678 return
1679
1680 kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
1681 try: name = apply(retrygrab, (url, filename), kwargs)
1682 except URLGrabError, e: print e
1683 else: print 'LOCAL FILE:', name
1684
1685def _file_object_test(filename=None):
1686 import cStringIO
1687 if filename is None:
1688 filename = __file__
1689 print 'using file "%s" for comparisons' % filename
1690 fo = open(filename)
1691 s_input = fo.read()
1692 fo.close()
1693
1694 for testfunc in [_test_file_object_smallread,
1695 _test_file_object_readall,
1696 _test_file_object_readline,
1697 _test_file_object_readlines]:
1698 fo_input = cStringIO.StringIO(s_input)
1699 fo_output = cStringIO.StringIO()
1700 wrapper = PyCurlFileObject(fo_input, None, 0)
1701 print 'testing %-30s ' % testfunc.__name__,
1702 testfunc(wrapper, fo_output)
1703 s_output = fo_output.getvalue()
1704 if s_output == s_input: print 'passed'
1705 else: print 'FAILED'
1706
1707def _test_file_object_smallread(wrapper, fo_output):
1708 while 1:
1709 s = wrapper.read(23)
1710 fo_output.write(s)
1711 if not s: return
1712
1713def _test_file_object_readall(wrapper, fo_output):
1714 s = wrapper.read()
1715 fo_output.write(s)
1716
1717def _test_file_object_readline(wrapper, fo_output):
1718 while 1:
1719 s = wrapper.readline()
1720 fo_output.write(s)
1721 if not s: return
1722
1723def _test_file_object_readlines(wrapper, fo_output):
1724 li = wrapper.readlines()
1725 fo_output.write(string.join(li, ''))
1726
1727if __name__ == '__main__':
1728 _main_test()
1729 _retry_test()
1730 _file_object_test('test')
17310
=== removed directory '.pc/progress_fix.diff'
=== removed directory '.pc/progress_fix.diff/urlgrabber'
=== removed file '.pc/progress_fix.diff/urlgrabber/progress.py'
--- .pc/progress_fix.diff/urlgrabber/progress.py 2010-07-08 17:40:08 +0000
+++ .pc/progress_fix.diff/urlgrabber/progress.py 1970-01-01 00:00:00 +0000
@@ -1,755 +0,0 @@
1# This library is free software; you can redistribute it and/or
2# modify it under the terms of the GNU Lesser General Public
3# License as published by the Free Software Foundation; either
4# version 2.1 of the License, or (at your option) any later version.
5#
6# This library is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9# Lesser General Public License for more details.
10#
11# You should have received a copy of the GNU Lesser General Public
12# License along with this library; if not, write to the
13# Free Software Foundation, Inc.,
14# 59 Temple Place, Suite 330,
15# Boston, MA 02111-1307 USA
16
17# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19
20
21import sys
22import time
23import math
24import thread
25import fcntl
26import struct
27import termios
28
29# Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html
30def terminal_width(fd=1):
31 """ Get the real terminal width """
32 try:
33 buf = 'abcdefgh'
34 buf = fcntl.ioctl(fd, termios.TIOCGWINSZ, buf)
35 ret = struct.unpack('hhhh', buf)[1]
36 if ret == 0:
37 return 80
38 # Add minimum too?
39 return ret
40 except: # IOError
41 return 80
42
43_term_width_val = None
44_term_width_last = None
45def terminal_width_cached(fd=1, cache_timeout=1.000):
46 """ Get the real terminal width, but cache it for a bit. """
47 global _term_width_val
48 global _term_width_last
49
50 now = time.time()
51 if _term_width_val is None or (now - _term_width_last) > cache_timeout:
52 _term_width_val = terminal_width(fd)
53 _term_width_last = now
54 return _term_width_val
55
56class TerminalLine:
57 """ Help create dynamic progress bars, uses terminal_width_cached(). """
58
59 def __init__(self, min_rest=0, beg_len=None, fd=1, cache_timeout=1.000):
60 if beg_len is None:
61 beg_len = min_rest
62 self._min_len = min_rest
63 self._llen = terminal_width_cached(fd, cache_timeout)
64 if self._llen < beg_len:
65 self._llen = beg_len
66 self._fin = False
67
68 def __len__(self):
69 """ Usable length for elements. """
70 return self._llen - self._min_len
71
72 def rest_split(self, fixed, elements=2):
73 """ After a fixed length, split the rest of the line length among
74 a number of different elements (default=2). """
75 if self._llen < fixed:
76 return 0
77 return (self._llen - fixed) / elements
78
79 def add(self, element, full_len=None):
80 """ If there is room left in the line, above min_len, add element.
81 Note that as soon as one add fails all the rest will fail too. """
82
83 if full_len is None:
84 full_len = len(element)
85 if len(self) < full_len:
86 self._fin = True
87 if self._fin:
88 return ''
89
90 self._llen -= len(element)
91 return element
92
93 def rest(self):
94 """ Current rest of line, same as .rest_split(fixed=0, elements=1). """
95 return self._llen
96
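# Illustrative sketch, not part of the original module: this is roughly how
# TextMeter below uses TerminalLine -- reserve a minimum width, add fields
# until the line is full (once one add() fails, later ones return ''), and
# give whatever is left to the text.  The helper name and the field values
# are made up.
def _example_line(text='somefile.rpm'):
    tl = TerminalLine(8)
    ui_size = tl.add(' | %5sB' % '1.2 M')
    ui_time = tl.add(' %9s' % '00:42')
    width = tl.rest()
    return '%-*.*s%s%s' % (width, width, text, ui_size, ui_time)
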
97class BaseMeter:
98 def __init__(self):
99 self.update_period = 0.3 # seconds
100
101 self.filename = None
102 self.url = None
103 self.basename = None
104 self.text = None
105 self.size = None
106 self.start_time = None
107 self.last_amount_read = 0
108 self.last_update_time = None
109 self.re = RateEstimator()
110
111 def start(self, filename=None, url=None, basename=None,
112 size=None, now=None, text=None):
113 self.filename = filename
114 self.url = url
115 self.basename = basename
116 self.text = text
117
118 #size = None ######### TESTING
119 self.size = size
120 if not size is None: self.fsize = format_number(size) + 'B'
121
122 if now is None: now = time.time()
123 self.start_time = now
124 self.re.start(size, now)
125 self.last_amount_read = 0
126 self.last_update_time = now
127 self._do_start(now)
128
129 def _do_start(self, now=None):
130 pass
131
132 def update(self, amount_read, now=None):
133 # for a real gui, you probably want to override and put a call
134 # to your mainloop iteration function here
135 if now is None: now = time.time()
136 if (now >= self.last_update_time + self.update_period) or \
137 not self.last_update_time:
138 self.re.update(amount_read, now)
139 self.last_amount_read = amount_read
140 self.last_update_time = now
141 self._do_update(amount_read, now)
142
143 def _do_update(self, amount_read, now=None):
144 pass
145
146 def end(self, amount_read, now=None):
147 if now is None: now = time.time()
148 self.re.update(amount_read, now)
149 self.last_amount_read = amount_read
150 self.last_update_time = now
151 self._do_end(amount_read, now)
152
153 def _do_end(self, amount_read, now=None):
154 pass
155
156# This is kind of a hack, but progress is gotten from grabber which doesn't
157# know about the total size to download. So we do this so we can get the data
158# out of band here. This will be "fixed" one way or another soon.
159_text_meter_total_size = 0
160_text_meter_sofar_size = 0
161def text_meter_total_size(size, downloaded=0):
162 global _text_meter_total_size
163 global _text_meter_sofar_size
164 _text_meter_total_size = size
165 _text_meter_sofar_size = downloaded
166
167#
168# update: No size (minimal: 17 chars)
169# -----------------------------------
170# <text> <rate> | <current size> <elapsed time>
171# 8-48 1 8 3 6 1 9 5
172#
173# Order: 1. <text>+<current size> (17)
174# 2. +<elapsed time> (10, total: 27)
175# 3. + ( 5, total: 32)
176# 4. +<rate> ( 9, total: 41)
177#
178# update: Size, Single file
179# -------------------------
180# <text> <pc> <bar> <rate> | <current size> <eta time> ETA
181# 8-25 1 3-4 1 6-16 1 8 3 6 1 9 1 3 1
182#
183# Order: 1. <text>+<current size> (17)
184# 2. +<eta time> (10, total: 27)
185# 3. +ETA ( 5, total: 32)
186# 4. +<pc> ( 4, total: 36)
187# 5. +<rate> ( 9, total: 45)
188# 6. +<bar> ( 7, total: 52)
189#
190# update: Size, All files
191# -----------------------
192# <text> <total pc> <pc> <bar> <rate> | <current size> <eta time> ETA
193# 8-22 1 5-7 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1
194#
195# Order: 1. <text>+<current size> (17)
196# 2. +<eta time> (10, total: 27)
197# 3. +ETA ( 5, total: 32)
198# 4. +<total pc> ( 5, total: 37)
199# 4. +<pc> ( 4, total: 41)
200# 5. +<rate> ( 9, total: 50)
201# 6. +<bar> ( 7, total: 57)
202#
203# end
204# ---
205# <text> | <current size> <elapsed time>
206# 8-56 3 6 1 9 5
207#
208# Order: 1. <text> ( 8)
209# 2. +<current size> ( 9, total: 17)
210# 3. +<elapsed time> (10, total: 27)
211# 4. + ( 5, total: 32)
212#
213
214class TextMeter(BaseMeter):
215 def __init__(self, fo=sys.stderr):
216 BaseMeter.__init__(self)
217 self.fo = fo
218
219 def _do_update(self, amount_read, now=None):
220 etime = self.re.elapsed_time()
221 fetime = format_time(etime)
222 fread = format_number(amount_read)
223 #self.size = None
224 if self.text is not None:
225 text = self.text
226 else:
227 text = self.basename
228
229 ave_dl = format_number(self.re.average_rate())
230 sofar_size = None
231 if _text_meter_total_size:
232 sofar_size = _text_meter_sofar_size + amount_read
233 sofar_pc = (sofar_size * 100) / _text_meter_total_size
234
235 # Include text + ui_rate in minimal
236 tl = TerminalLine(8, 8+1+8)
237 ui_size = tl.add(' | %5sB' % fread)
238 if self.size is None:
239 ui_time = tl.add(' %9s' % fetime)
240 ui_end = tl.add(' ' * 5)
241 ui_rate = tl.add(' %5sB/s' % ave_dl)
242 out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
243 ui_rate, ui_size, ui_time, ui_end)
244 else:
245 rtime = self.re.remaining_time()
246 frtime = format_time(rtime)
247 frac = self.re.fraction_read()
248
249 ui_time = tl.add(' %9s' % frtime)
250 ui_end = tl.add(' ETA ')
251
252 if sofar_size is None:
253 ui_sofar_pc = ''
254 else:
255 ui_sofar_pc = tl.add(' (%i%%)' % sofar_pc,
256 full_len=len(" (100%)"))
257
258 ui_pc = tl.add(' %2i%%' % (frac*100))
259 ui_rate = tl.add(' %5sB/s' % ave_dl)
260 # Make text grow a bit before we start growing the bar too
261 blen = 4 + tl.rest_split(8 + 8 + 4)
262 bar = '='*int(blen * frac)
263 if (blen * frac) - int(blen * frac) >= 0.5:
264 bar += '-'
265 ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
266 out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
267 ui_sofar_pc, ui_pc, ui_bar,
268 ui_rate, ui_size, ui_time, ui_end)
269
270 self.fo.write(out)
271 self.fo.flush()
272
273 def _do_end(self, amount_read, now=None):
274 global _text_meter_total_size
275 global _text_meter_sofar_size
276
277 total_time = format_time(self.re.elapsed_time())
278 total_size = format_number(amount_read)
279 if self.text is not None:
280 text = self.text
281 else:
282 text = self.basename
283
284 tl = TerminalLine(8)
285 ui_size = tl.add(' | %5sB' % total_size)
286 ui_time = tl.add(' %9s' % total_time)
287 not_done = self.size is not None and amount_read != self.size
288 if not_done:
289 ui_end = tl.add(' ... ')
290 else:
291 ui_end = tl.add(' ' * 5)
292
293 out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
294 ui_size, ui_time, ui_end)
295 self.fo.write(out)
296 self.fo.flush()
297
298 # Don't add size to the sofar size until we have all of it.
299 # If we don't have a size, then just pretend/hope we got all of it.
300 if not_done:
301 return
302
303 if _text_meter_total_size:
304 _text_meter_sofar_size += amount_read
305 if _text_meter_total_size <= _text_meter_sofar_size:
306 _text_meter_total_size = 0
307 _text_meter_sofar_size = 0
308
309text_progress_meter = TextMeter
310
311class MultiFileHelper(BaseMeter):
312 def __init__(self, master):
313 BaseMeter.__init__(self)
314 self.master = master
315
316 def _do_start(self, now):
317 self.master.start_meter(self, now)
318
319 def _do_update(self, amount_read, now):
320 # elapsed time since last update
321 self.master.update_meter(self, now)
322
323 def _do_end(self, amount_read, now):
324 self.ftotal_time = format_time(now - self.start_time)
325 self.ftotal_size = format_number(self.last_amount_read)
326 self.master.end_meter(self, now)
327
328 def failure(self, message, now=None):
329 self.master.failure_meter(self, message, now)
330
331 def message(self, message):
332 self.master.message_meter(self, message)
333
334class MultiFileMeter:
335 helperclass = MultiFileHelper
336 def __init__(self):
337 self.meters = []
338 self.in_progress_meters = []
339 self._lock = thread.allocate_lock()
340 self.update_period = 0.3 # seconds
341
342 self.numfiles = None
343 self.finished_files = 0
344 self.failed_files = 0
345 self.open_files = 0
346 self.total_size = None
347 self.failed_size = 0
348 self.start_time = None
349 self.finished_file_size = 0
350 self.last_update_time = None
351 self.re = RateEstimator()
352
353 def start(self, numfiles=None, total_size=None, now=None):
354 if now is None: now = time.time()
355 self.numfiles = numfiles
356 self.finished_files = 0
357 self.failed_files = 0
358 self.open_files = 0
359 self.total_size = total_size
360 self.failed_size = 0
361 self.start_time = now
362 self.finished_file_size = 0
363 self.last_update_time = now
364 self.re.start(total_size, now)
365 self._do_start(now)
366
367 def _do_start(self, now):
368 pass
369
370 def end(self, now=None):
371 if now is None: now = time.time()
372 self._do_end(now)
373
374 def _do_end(self, now):
375 pass
376
377 def lock(self): self._lock.acquire()
378 def unlock(self): self._lock.release()
379
380 ###########################################################
381 # child meter creation and destruction
382 def newMeter(self):
383 newmeter = self.helperclass(self)
384 self.meters.append(newmeter)
385 return newmeter
386
387 def removeMeter(self, meter):
388 self.meters.remove(meter)
389
390 ###########################################################
391 # child functions - these should only be called by helpers
392 def start_meter(self, meter, now):
393 if not meter in self.meters:
394 raise ValueError('attempt to use orphaned meter')
395 self._lock.acquire()
396 try:
397 if not meter in self.in_progress_meters:
398 self.in_progress_meters.append(meter)
399 self.open_files += 1
400 finally:
401 self._lock.release()
402 self._do_start_meter(meter, now)
403
404 def _do_start_meter(self, meter, now):
405 pass
406
407 def update_meter(self, meter, now):
408 if not meter in self.meters:
409 raise ValueError('attempt to use orphaned meter')
410 if (now >= self.last_update_time + self.update_period) or \
411 not self.last_update_time:
412 self.re.update(self._amount_read(), now)
413 self.last_update_time = now
414 self._do_update_meter(meter, now)
415
416 def _do_update_meter(self, meter, now):
417 pass
418
419 def end_meter(self, meter, now):
420 if not meter in self.meters:
421 raise ValueError('attempt to use orphaned meter')
422 self._lock.acquire()
423 try:
424 try: self.in_progress_meters.remove(meter)
425 except ValueError: pass
426 self.open_files -= 1
427 self.finished_files += 1
428 self.finished_file_size += meter.last_amount_read
429 finally:
430 self._lock.release()
431 self._do_end_meter(meter, now)
432
433 def _do_end_meter(self, meter, now):
434 pass
435
436 def failure_meter(self, meter, message, now):
437 if not meter in self.meters:
438 raise ValueError('attempt to use orphaned meter')
439 self._lock.acquire()
440 try:
441 try: self.in_progress_meters.remove(meter)
442 except ValueError: pass
443 self.open_files -= 1
444 self.failed_files += 1
445 if meter.size and self.failed_size is not None:
446 self.failed_size += meter.size
447 else:
448 self.failed_size = None
449 finally:
450 self._lock.release()
451 self._do_failure_meter(meter, message, now)
452
453 def _do_failure_meter(self, meter, message, now):
454 pass
455
456 def message_meter(self, meter, message):
457 pass
458
459 ########################################################
460 # internal functions
461 def _amount_read(self):
462 tot = self.finished_file_size
463 for m in self.in_progress_meters:
464 tot += m.last_amount_read
465 return tot
466
467
468class TextMultiFileMeter(MultiFileMeter):
469 def __init__(self, fo=sys.stderr):
470 self.fo = fo
471 MultiFileMeter.__init__(self)
472
473 # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
474 def _do_update_meter(self, meter, now):
475 self._lock.acquire()
476 try:
477 format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
478 "time: %8.8s/%8.8s"
479 df = self.finished_files
480 tf = self.numfiles or 1
481 pf = 100 * float(df)/tf + 0.49
482 dd = self.re.last_amount_read
483 td = self.total_size
484 pd = 100 * (self.re.fraction_read() or 0) + 0.49
485 dt = self.re.elapsed_time()
486 rt = self.re.remaining_time()
487 if rt is None: tt = None
488 else: tt = dt + rt
489
490 fdd = format_number(dd) + 'B'
491 ftd = format_number(td) + 'B'
492 fdt = format_time(dt, 1)
493 ftt = format_time(tt, 1)
494
495 out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
496 self.fo.write('\r' + out)
497 self.fo.flush()
498 finally:
499 self._lock.release()
500
501 def _do_end_meter(self, meter, now):
502 self._lock.acquire()
503 try:
504 format = "%-30.30s %6.6s %8.8s %9.9s"
505 fn = meter.basename
506 size = meter.last_amount_read
507 fsize = format_number(size) + 'B'
508 et = meter.re.elapsed_time()
509 fet = format_time(et, 1)
510 frate = format_number(size / et) + 'B/s'
511
512 out = '%-79.79s' % (format % (fn, fsize, fet, frate))
513 self.fo.write('\r' + out + '\n')
514 finally:
515 self._lock.release()
516 self._do_update_meter(meter, now)
517
518 def _do_failure_meter(self, meter, message, now):
519 self._lock.acquire()
520 try:
521 format = "%-30.30s %6.6s %s"
522 fn = meter.basename
523 if type(message) in (type(''), type(u'')):
524 message = message.splitlines()
525 if not message: message = ['']
526 out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
527 self.fo.write('\r' + out + '\n')
528 for m in message[1:]: self.fo.write(' ' + m + '\n')
529 self._lock.release()
530 finally:
531 self._do_update_meter(meter, now)
532
533 def message_meter(self, meter, message):
534 self._lock.acquire()
535 try:
536 pass
537 finally:
538 self._lock.release()
539
540 def _do_end(self, now):
541 self._do_update_meter(None, now)
542 self._lock.acquire()
543 try:
544 self.fo.write('\n')
545 self.fo.flush()
546 finally:
547 self._lock.release()
548
549######################################################################
550# support classes and functions
551
552class RateEstimator:
553 def __init__(self, timescale=5.0):
554 self.timescale = timescale
555
556 def start(self, total=None, now=None):
557 if now is None: now = time.time()
558 self.total = total
559 self.start_time = now
560 self.last_update_time = now
561 self.last_amount_read = 0
562 self.ave_rate = None
563
564 def update(self, amount_read, now=None):
565 if now is None: now = time.time()
566 if amount_read == 0:
567 # if we just started this file, all bets are off
568 self.last_update_time = now
569 self.last_amount_read = 0
570 self.ave_rate = None
571 return
572
573 #print 'times', now, self.last_update_time
574 time_diff = now - self.last_update_time
575 read_diff = amount_read - self.last_amount_read
 576 # Skip the first sample: on a reget it is just the pre-existing file size
577 if self.last_amount_read:
578 self.last_update_time = now
579 self.ave_rate = self._temporal_rolling_ave(\
580 time_diff, read_diff, self.ave_rate, self.timescale)
581 self.last_amount_read = amount_read
582 #print 'results', time_diff, read_diff, self.ave_rate
583
584 #####################################################################
585 # result methods
586 def average_rate(self):
587 "get the average transfer rate (in bytes/second)"
588 return self.ave_rate
589
590 def elapsed_time(self):
591 "the time between the start of the transfer and the most recent update"
592 return self.last_update_time - self.start_time
593
594 def remaining_time(self):
595 "estimated time remaining"
596 if not self.ave_rate or not self.total: return None
597 return (self.total - self.last_amount_read) / self.ave_rate
598
599 def fraction_read(self):
600 """the fraction of the data that has been read
601 (can be None for unknown transfer size)"""
602 if self.total is None: return None
603 elif self.total == 0: return 1.0
604 else: return float(self.last_amount_read)/self.total
605
606 #########################################################################
607 # support methods
608 def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
609 """a temporal rolling average performs smooth averaging even when
610 updates come at irregular intervals. This is performed by scaling
611 the "epsilon" according to the time since the last update.
612 Specifically, epsilon = time_diff / timescale
613
614 As a general rule, the average will take on a completely new value
615 after 'timescale' seconds."""
616 epsilon = time_diff / timescale
617 if epsilon > 1: epsilon = 1.0
618 return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
619
620 def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
621 """perform a "rolling average" iteration
622 a rolling average "folds" new data into an existing average with
623 some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
624 a value of 0.0 means only the old value (initial value) counts,
625 and a value of 1.0 means only the newest value is considered."""
626
627 try:
628 recent_rate = read_diff / time_diff
629 except ZeroDivisionError:
630 recent_rate = None
631 if last_ave is None: return recent_rate
632 elif recent_rate is None: return last_ave
633
634 # at this point, both last_ave and recent_rate are numbers
635 return epsilon * recent_rate + (1 - epsilon) * last_ave
636
637 def _round_remaining_time(self, rt, start_time=15.0):
638 """round the remaining time, depending on its size
639 If rt is between n*start_time and (n+1)*start_time round downward
640 to the nearest multiple of n (for any counting number n).
641 If rt < start_time, round down to the nearest 1.
642 For example (for start_time = 15.0):
643 2.7 -> 2.0
644 25.2 -> 25.0
645 26.4 -> 26.0
646 35.3 -> 34.0
647 63.6 -> 60.0
648 """
649
650 if rt < 0: return 0.0
651 shift = int(math.log(rt/start_time)/math.log(2))
652 rt = int(rt)
653 if shift <= 0: return rt
654 return float(int(rt) >> shift << shift)
655
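# Worked example of the rolling average above (illustrative numbers, not
# taken from a real transfer):
#
#     re = RateEstimator(timescale=5.0)
#     re.start(total=1000000, now=0.0)
#     re.update(100000, now=1.0)  # first sample only primes last_amount_read
#     re.update(200000, now=2.0)  # 100000 B over 2.0 s -> ave_rate == 50000
#     re.update(700000, now=3.0)  # epsilon = 1.0/5.0 = 0.2, recent rate 500000
#     re.average_rate()           # -> 0.2*500000 + 0.8*50000 == 140000.0 B/s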
656
657def format_time(seconds, use_hours=0):
658 if seconds is None or seconds < 0:
659 if use_hours: return '--:--:--'
660 else: return '--:--'
661 else:
662 seconds = int(seconds)
663 minutes = seconds / 60
664 seconds = seconds % 60
665 if use_hours:
666 hours = minutes / 60
667 minutes = minutes % 60
668 return '%02i:%02i:%02i' % (hours, minutes, seconds)
669 else:
670 return '%02i:%02i' % (minutes, seconds)
671
672def format_number(number, SI=0, space=' '):
673 """Turn numbers into human-readable metric-like numbers"""
674 symbols = ['', # (none)
675 'k', # kilo
676 'M', # mega
677 'G', # giga
678 'T', # tera
679 'P', # peta
680 'E', # exa
681 'Z', # zetta
682 'Y'] # yotta
683
684 if SI: step = 1000.0
685 else: step = 1024.0
686
687 thresh = 999
688 depth = 0
689 max_depth = len(symbols) - 1
690
691 # we want numbers between 0 and thresh, but don't exceed the length
692 # of our list. In that event, the formatting will be screwed up,
693 # but it'll still show the right number.
694 while number > thresh and depth < max_depth:
695 depth = depth + 1
696 number = number / step
697
698 if type(number) == type(1) or type(number) == type(1L):
699 # it's an int or a long, which means it didn't get divided,
700 # which means it's already short enough
701 format = '%i%s%s'
702 elif number < 9.95:
703 # must use 9.95 for proper sizing. For example, 9.99 will be
704 # rounded to 10.0 with the .1f format string (which is too long)
705 format = '%.1f%s%s'
706 else:
707 format = '%.0f%s%s'
708
709 return(format % (float(number or 0), space, symbols[depth]))
710
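# Reference values for the two helpers above, derived from the code as
# written (1024-based unless SI is set; format_number() pads with a space
# when there is no suffix):
#
#     format_time(125)            -> '02:05'
#     format_time(3661, 1)        -> '01:01:01'
#     format_number(500)          -> '500 '
#     format_number(10000)        -> '9.8 k'
#     format_number(10000, SI=1)  -> '10 k'
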
711def _tst(fn, cur, tot, beg, size, *args):
712 tm = TextMeter()
713 text = "(%d/%d): %s" % (cur, tot, fn)
714 tm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size, text=text)
715 num = beg
716 off = 0
717 for (inc, delay) in args:
718 off += 1
719 while num < ((size * off) / len(args)):
720 num += inc
721 tm.update(num)
722 time.sleep(delay)
723 tm.end(size)
724
725if __name__ == "__main__":
726 # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
727 # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
728 if len(sys.argv) >= 2 and sys.argv[1] == 'total':
729 text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
730 1000000 + 10000 + 10000 + 10000 + 1000000)
731 _tst("sm-1.0.0-1.fc8.i386.rpm", 1, 10, 0, 1000,
732 (10, 0.2), (10, 0.1), (100, 0.25))
733 _tst("s-1.0.1-1.fc8.i386.rpm", 2, 10, 0, 10000,
734 (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
735 _tst("m-1.0.1-2.fc8.i386.rpm", 3, 10, 5000, 10000,
736 (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
737 _tst("large-file-name-Foo-11.8.7-4.5.6.1.fc8.x86_64.rpm", 4, 10, 0, 1000000,
738 (1000, 0.2), (1000, 0.1), (10000, 0.1))
739 _tst("large-file-name-Foo2-11.8.7-4.5.6.2.fc8.x86_64.rpm", 5, 10,
740 500001, 1000000, (1000, 0.2), (1000, 0.1), (10000, 0.1))
741 _tst("large-file-name-Foo3-11.8.7-4.5.6.3.fc8.x86_64.rpm", 6, 10,
742 750002, 1000000, (1000, 0.2), (1000, 0.1), (10000, 0.1))
743 _tst("large-file-name-Foo4-10.8.7-4.5.6.1.fc8.x86_64.rpm", 7, 10, 0, 10000,
744 (100, 0.1))
745 _tst("large-file-name-Foo5-10.8.7-4.5.6.2.fc8.x86_64.rpm", 8, 10,
746 5001, 10000, (100, 0.1))
747 _tst("large-file-name-Foo6-10.8.7-4.5.6.3.fc8.x86_64.rpm", 9, 10,
748 7502, 10000, (1, 0.1))
749 _tst("large-file-name-Foox-9.8.7-4.5.6.1.fc8.x86_64.rpm", 10, 10,
750 0, 1000000, (10, 0.5),
751 (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
752 (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
753 (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
754 (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
755 (100000, 0.1), (1, 0.1))
7560
=== removed directory '.pc/progress_object_callback_fix.diff'
=== removed directory '.pc/progress_object_callback_fix.diff/urlgrabber'
=== removed file '.pc/progress_object_callback_fix.diff/urlgrabber/grabber.py'
--- .pc/progress_object_callback_fix.diff/urlgrabber/grabber.py 2011-08-09 17:45:08 +0000
+++ .pc/progress_object_callback_fix.diff/urlgrabber/grabber.py 1970-01-01 00:00:00 +0000
@@ -1,1802 +0,0 @@
1# This library is free software; you can redistribute it and/or
2# modify it under the terms of the GNU Lesser General Public
3# License as published by the Free Software Foundation; either
4# version 2.1 of the License, or (at your option) any later version.
5#
6# This library is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9# Lesser General Public License for more details.
10#
11# You should have received a copy of the GNU Lesser General Public
12# License along with this library; if not, write to the
13# Free Software Foundation, Inc.,
14# 59 Temple Place, Suite 330,
15# Boston, MA 02111-1307 USA
16
17# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
20
21"""A high-level cross-protocol url-grabber.
22
23GENERAL ARGUMENTS (kwargs)
24
25 Where possible, the module-level default is indicated, and legal
26 values are provided.
27
28 copy_local = 0 [0|1]
29
30 ignored except for file:// urls, in which case it specifies
31 whether urlgrab should still make a copy of the file, or simply
32 point to the existing copy. The module level default for this
33 option is 0.
34
35 close_connection = 0 [0|1]
36
37 tells URLGrabber to close the connection after a file has been
38 transfered. This is ignored unless the download happens with the
39 http keepalive handler (keepalive=1). Otherwise, the connection
40 is left open for further use. The module level default for this
41 option is 0 (keepalive connections will not be closed).
42
43 keepalive = 1 [0|1]
44
45 specifies whether keepalive should be used for HTTP/1.1 servers
46 that support it. The module level default for this option is 1
47 (keepalive is enabled).
48
49 progress_obj = None
50
51 a class instance that supports the following methods:
52 po.start(filename, url, basename, length, text)
53 # length will be None if unknown
54 po.update(read) # read == bytes read so far
55 po.end()
56
57 text = None
58
59 specifies alternative text to be passed to the progress meter
60 object. If not given, the default progress meter will use the
61 basename of the file.
62
63 throttle = 1.0
64
65 a number - if it's an int, it's the bytes/second throttle limit.
66 If it's a float, it is first multiplied by bandwidth. If throttle
67 == 0, throttling is disabled. If None, the module-level default
68 (which can be set on default_grabber.throttle) is used. See
69 BANDWIDTH THROTTLING for more information.
70
71 timeout = 300
72
73 a positive integer expressing the number of seconds to wait before
74 timing out attempts to connect to a server. If the value is None
75 or 0, connection attempts will not time out. The timeout is passed
76 to the underlying pycurl object as its CONNECTTIMEOUT option, see
77 the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
78 http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
79
80 bandwidth = 0
81
82 the nominal max bandwidth in bytes/second. If throttle is a float
83 and bandwidth == 0, throttling is disabled. If None, the
84 module-level default (which can be set on
85 default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
86 more information.
87
88 range = None
89
90 a tuple of the form (first_byte, last_byte) describing a byte
 91 range to retrieve. Either or both of the values may be set to
92 None. If first_byte is None, byte offset 0 is assumed. If
93 last_byte is None, the last byte available is assumed. Note that
 94 the range specification is python-like in that (0,10) will yield
95 the first 10 bytes of the file.
96
97 If set to None, no range will be used.
98
99 reget = None [None|'simple'|'check_timestamp']
100
101 whether to attempt to reget a partially-downloaded file. Reget
102 only applies to .urlgrab and (obviously) only if there is a
103 partially downloaded file. Reget has two modes:
104
105 'simple' -- the local file will always be trusted. If there
106 are 100 bytes in the local file, then the download will always
107 begin 100 bytes into the requested file.
108
109 'check_timestamp' -- the timestamp of the server file will be
110 compared to the timestamp of the local file. ONLY if the
111 local file is newer than or the same age as the server file
112 will reget be used. If the server file is newer, or the
113 timestamp is not returned, the entire file will be fetched.
114
115 NOTE: urlgrabber can do very little to verify that the partial
116 file on disk is identical to the beginning of the remote file.
117 You may want to either employ a custom "checkfunc" or simply avoid
118 using reget in situations where corruption is a concern.
119
120 user_agent = 'urlgrabber/VERSION'
121
122 a string, usually of the form 'AGENT/VERSION' that is provided to
123 HTTP servers in the User-agent header. The module level default
124 for this option is "urlgrabber/VERSION".
125
126 http_headers = None
127
128 a tuple of 2-tuples, each containing a header and value. These
129 will be used for http and https requests only. For example, you
130 can do
131 http_headers = (('Pragma', 'no-cache'),)
132
133 ftp_headers = None
134
135 this is just like http_headers, but will be used for ftp requests.
136
137 proxies = None
138
139 a dictionary that maps protocol schemes to proxy hosts. For
140 example, to use a proxy server on host "foo" port 3128 for http
141 and https URLs:
142 proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
143 note that proxy authentication information may be provided using
144 normal URL constructs:
145 proxies={ 'http' : 'http://user:host@foo:3128' }
146 Lastly, if proxies is None, the default environment settings will
147 be used.
148
149 prefix = None
150
151 a url prefix that will be prepended to all requested urls. For
152 example:
153 g = URLGrabber(prefix='http://foo.com/mirror/')
154 g.urlgrab('some/file.txt')
155 ## this will fetch 'http://foo.com/mirror/some/file.txt'
156 This option exists primarily to allow identical behavior to
157 MirrorGroup (and derived) instances. Note: a '/' will be inserted
158 if necessary, so you cannot specify a prefix that ends with a
159 partial file or directory name.
160
161 opener = None
162 No-op when using the curl backend (default)
163
164 cache_openers = True
165 No-op when using the curl backend (default)
166
167 data = None
168
169 Only relevant for the HTTP family (and ignored for other
170 protocols), this allows HTTP POSTs. When the data kwarg is
171 present (and not None), an HTTP request will automatically become
172 a POST rather than GET. This is done by direct passthrough to
173 urllib2. If you use this, you may also want to set the
174 'Content-length' and 'Content-type' headers with the http_headers
175 option. Note that python 2.2 handles the case of these
176 badly and if you do not use the proper case (shown here), your
177 values will be overridden with the defaults.
178
179 urlparser = URLParser()
180
181 The URLParser class handles pre-processing of URLs, including
182 auth-handling for user/pass encoded in http urls, file handing
183 (that is, filenames not sent as a URL), and URL quoting. If you
184 want to override any of this behavior, you can pass in a
185 replacement instance. See also the 'quote' option.
186
187 quote = None
188
189 Whether or not to quote the path portion of a url.
190 quote = 1 -> quote the URLs (they're not quoted yet)
191 quote = 0 -> do not quote them (they're already quoted)
192 quote = None -> guess what to do
193
194 This option only affects proper urls like 'file:///etc/passwd'; it
195 does not affect 'raw' filenames like '/etc/passwd'. The latter
196 will always be quoted as they are converted to URLs. Also, only
197 the path part of a url is quoted. If you need more fine-grained
198 control, you should probably subclass URLParser and pass it in via
199 the 'urlparser' option.
200
201 ssl_ca_cert = None
202
203 this option can be used if M2Crypto is available and will be
204 ignored otherwise. If provided, it will be used to create an SSL
205 context. If both ssl_ca_cert and ssl_context are provided, then
206 ssl_context will be ignored and a new context will be created from
207 ssl_ca_cert.
208
209 ssl_context = None
210
211 No-op when using the curl backend (default)
212
213
214 self.ssl_verify_peer = True
215
 216 Check the server's certificate to make sure it is valid against our CA certificates
217
218 self.ssl_verify_host = True
219
220 Check the server's hostname to make sure it matches the certificate DN
221
222 self.ssl_key = None
223
224 Path to the key the client should use to connect/authenticate with
225
226 self.ssl_key_type = 'PEM'
227
228 PEM or DER - format of key
229
230 self.ssl_cert = None
231
 232 Path to the ssl certificate the client should use to authenticate with
233
234 self.ssl_cert_type = 'PEM'
235
236 PEM or DER - format of certificate
237
238 self.ssl_key_pass = None
239
240 password to access the ssl_key
241
242 self.size = None
243
 244 size (in bytes), i.e. the maximum size, of the thing being downloaded.
 245 This is mostly to keep us from exploding with an endless datastream.
246
247 self.max_header_size = 2097152
248
249 Maximum size (in bytes) of the headers.
250
251
252RETRY RELATED ARGUMENTS
253
254 retry = None
255
256 the number of times to retry the grab before bailing. If this is
257 zero, it will retry forever. This was intentional... really, it
258 was :). If this value is not supplied or is supplied but is None
259 retrying does not occur.
260
261 retrycodes = [-1,2,4,5,6,7]
262
263 a sequence of errorcodes (values of e.errno) for which it should
264 retry. See the doc on URLGrabError for more details on this. You
265 might consider modifying a copy of the default codes rather than
266 building yours from scratch so that if the list is extended in the
267 future (or one code is split into two) you can still enjoy the
268 benefits of the default list. You can do that with something like
269 this:
270
271 retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
272 if 12 not in retrycodes:
273 retrycodes.append(12)
274
275 checkfunc = None
276
277 a function to do additional checks. This defaults to None, which
278 means no additional checking. The function should simply return
279 on a successful check. It should raise URLGrabError on an
280 unsuccessful check. Raising of any other exception will be
281 considered immediate failure and no retries will occur.
282
283 If it raises URLGrabError, the error code will determine the retry
284 behavior. Negative error numbers are reserved for use by these
285 passed in functions, so you can use many negative numbers for
286 different types of failure. By default, -1 results in a retry,
287 but this can be customized with retrycodes.
288
289 If you simply pass in a function, it will be given exactly one
290 argument: a CallbackObject instance with the .url attribute
291 defined and either .filename (for urlgrab) or .data (for urlread).
292 For urlgrab, .filename is the name of the local file. For
293 urlread, .data is the actual string data. If you need other
294 arguments passed to the callback (program state of some sort), you
295 can do so like this:
296
297 checkfunc=(function, ('arg1', 2), {'kwarg': 3})
298
299 if the downloaded file has filename /tmp/stuff, then this will
300 result in this call (for urlgrab):
301
302 function(obj, 'arg1', 2, kwarg=3)
303 # obj.filename = '/tmp/stuff'
304 # obj.url = 'http://foo.com/stuff'
305
306 NOTE: both the "args" tuple and "kwargs" dict must be present if
307 you use this syntax, but either (or both) can be empty.
308
309 failure_callback = None
310
311 The callback that gets called during retries when an attempt to
312 fetch a file fails. The syntax for specifying the callback is
313 identical to checkfunc, except for the attributes defined in the
314 CallbackObject instance. The attributes for failure_callback are:
315
316 exception = the raised exception
317 url = the url we're trying to fetch
318 tries = the number of tries so far (including this one)
319 retry = the value of the retry option
320
321 The callback is present primarily to inform the calling program of
322 the failure, but if it raises an exception (including the one it's
323 passed) that exception will NOT be caught and will therefore cause
324 future retries to be aborted.
325
326 The callback is called for EVERY failure, including the last one.
327 On the last try, the callback can raise an alternate exception,
328 but it cannot (without severe trickiness) prevent the exception
329 from being raised.
330
331 interrupt_callback = None
332
333 This callback is called if KeyboardInterrupt is received at any
334 point in the transfer. Basically, this callback can have three
335 impacts on the fetch process based on the way it exits:
336
337 1) raise no exception: the current fetch will be aborted, but
338 any further retries will still take place
339
340 2) raise a URLGrabError: if you're using a MirrorGroup, then
341 this will prompt a failover to the next mirror according to
342 the behavior of the MirrorGroup subclass. It is recommended
343 that you raise URLGrabError with code 15, 'user abort'. If
344 you are NOT using a MirrorGroup subclass, then this is the
345 same as (3).
346
347 3) raise some other exception (such as KeyboardInterrupt), which
348 will not be caught at either the grabber or mirror levels.
349 That is, it will be raised up all the way to the caller.
350
351 This callback is very similar to failure_callback. They are
352 passed the same arguments, so you could use the same function for
353 both.
354
355BANDWIDTH THROTTLING
356
357 urlgrabber supports throttling via two values: throttle and
 358 bandwidth. Between the two, you can either specify an absolute
 359 throttle threshold or specify a threshold as a fraction of maximum
360 available bandwidth.
361
362 throttle is a number - if it's an int, it's the bytes/second
363 throttle limit. If it's a float, it is first multiplied by
364 bandwidth. If throttle == 0, throttling is disabled. If None, the
365 module-level default (which can be set with set_throttle) is used.
366
367 bandwidth is the nominal max bandwidth in bytes/second. If throttle
368 is a float and bandwidth == 0, throttling is disabled. If None, the
369 module-level default (which can be set with set_bandwidth) is used.
370
371 THROTTLING EXAMPLES:
372
373 Lets say you have a 100 Mbps connection. This is (about) 10^8 bits
374 per second, or 12,500,000 Bytes per second. You have a number of
375 throttling options:
376
377 *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
378
379 This will limit urlgrab to use half of your available bandwidth.
380
381 *) set_throttle(6250000) # throttle is an int
382
383 This will also limit urlgrab to use half of your available
384 bandwidth, regardless of what bandwidth is set to.
385
 386  *) set_bandwidth(6250000); set_throttle(1.0) # float
387
388 Use half your bandwidth
389
 390  *) set_bandwidth(6250000); set_throttle(2.0) # float
391
392 Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
393
 394  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
395
396 Disable throttling - this is more efficient than a very large
397 throttle setting.
398
 399  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0
400
401 Disable throttling - this is the default when the module is loaded.
402
403 SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
404
405 While this is flexible, it's not extremely obvious to the user. I
406 suggest you implement a float throttle as a percent to make the
407 distinction between absolute and relative throttling very explicit.
408
409 Also, you may want to convert the units to something more convenient
410 than bytes/second, such as kbps or kB/s, etc.
411
412"""
413
414
415
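# Illustrative sketch pulling several of the keyword arguments documented
# above into a single grabber; the host names, paths and proxy are made up:
#
#     from urlgrabber.grabber import URLGrabber
#     from urlgrabber.progress import TextMeter
#
#     g = URLGrabber(prefix='http://example.com/mirror/',
#                    progress_obj=TextMeter(),
#                    bandwidth=1000000, throttle=0.5,  # half of ~1 MB/s
#                    retry=3,
#                    proxies={'http': 'http://proxy.example.com:3128'})
#     g.urlgrab('some/file.txt', '/tmp/file.txt')
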
416import os
417import sys
418import urlparse
419import time
420import string
421import urllib
422import urllib2
423import mimetools
424import thread
425import types
426import stat
427import pycurl
428from ftplib import parse150
429from StringIO import StringIO
430from httplib import HTTPException
431import socket
432from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
433
434########################################################################
435# MODULE INITIALIZATION
436########################################################################
437try:
438 exec('from ' + (__name__.split('.'))[0] + ' import __version__')
439except:
440 __version__ = '???'
441
442try:
443 # this part isn't going to do much - need to talk to gettext
444 from i18n import _
445except ImportError, msg:
446 def _(st): return st
447
448########################################################################
449# functions for debugging output. These functions are here because they
450# are also part of the module initialization.
451DEBUG = None
452def set_logger(DBOBJ):
453 """Set the DEBUG object. This is called by _init_default_logger when
454 the environment variable URLGRABBER_DEBUG is set, but can also be
455 called by a calling program. Basically, if the calling program uses
456 the logging module and would like to incorporate urlgrabber logging,
457 then it can do so this way. It's probably not necessary as most
458 internal logging is only for debugging purposes.
459
460 The passed-in object should be a logging.Logger instance. It will
461 be pushed into the keepalive and byterange modules if they're
462 being used. The mirror module pulls this object in on import, so
463 you will need to manually push into it. In fact, you may find it
464 tidier to simply push your logging object (or objects) into each
465 of these modules independently.
466 """
467
468 global DEBUG
469 DEBUG = DBOBJ
470
471def _init_default_logger(logspec=None):
472 '''Examines the environment variable URLGRABBER_DEBUG and creates
473 a logging object (logging.logger) based on the contents. It takes
474 the form
475
476 URLGRABBER_DEBUG=level,filename
477
478 where "level" can be either an integer or a log level from the
479 logging module (DEBUG, INFO, etc). If the integer is zero or
480 less, logging will be disabled. Filename is the filename where
481 logs will be sent. If it is "-", then stdout will be used. If
482 the filename is empty or missing, stderr will be used. If the
483 variable cannot be processed or the logging module cannot be
484 imported (python < 2.3) then logging will be disabled. Here are
485 some examples:
486
487 URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
488 URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
489 URLGRABBER_DEBUG=INFO # log info and higher to stderr
490
 491 This function is called during module initialization. It is not
492 intended to be called from outside. The only reason it is a
493 function at all is to keep the module-level namespace tidy and to
494 collect the code into a nice block.'''
495
496 try:
497 if logspec is None:
498 logspec = os.environ['URLGRABBER_DEBUG']
499 dbinfo = logspec.split(',')
500 import logging
501 level = logging._levelNames.get(dbinfo[0], None)
502 if level is None: level = int(dbinfo[0])
503 if level < 1: raise ValueError()
504
505 formatter = logging.Formatter('%(asctime)s %(message)s')
506 if len(dbinfo) > 1: filename = dbinfo[1]
507 else: filename = ''
508 if filename == '': handler = logging.StreamHandler(sys.stderr)
509 elif filename == '-': handler = logging.StreamHandler(sys.stdout)
510 else: handler = logging.FileHandler(filename)
511 handler.setFormatter(formatter)
512 DBOBJ = logging.getLogger('urlgrabber')
513 DBOBJ.addHandler(handler)
514 DBOBJ.setLevel(level)
515 except (KeyError, ImportError, ValueError):
516 DBOBJ = None
517 set_logger(DBOBJ)
518
519def _log_package_state():
520 if not DEBUG: return
521 DEBUG.info('urlgrabber version = %s' % __version__)
522 DEBUG.info('trans function "_" = %s' % _)
523
524_init_default_logger()
525_log_package_state()
526
527
528# normally this would be from i18n or something like it ...
529def _(st):
530 return st
531
532########################################################################
533# END MODULE INITIALIZATION
534########################################################################
535
536
537
538class URLGrabError(IOError):
539 """
540 URLGrabError error codes:
541
542 URLGrabber error codes (0 -- 255)
543 0 - everything looks good (you should never see this)
544 1 - malformed url
545 2 - local file doesn't exist
546 3 - request for non-file local file (dir, etc)
547 4 - IOError on fetch
548 5 - OSError on fetch
549 6 - no content length header when we expected one
550 7 - HTTPException
551 8 - Exceeded read limit (for urlread)
552 9 - Requested byte range not satisfiable.
553 10 - Byte range requested, but range support unavailable
554 11 - Illegal reget mode
555 12 - Socket timeout
556 13 - malformed proxy url
557 14 - HTTPError (includes .code and .exception attributes)
558 15 - user abort
559 16 - error writing to local file
560
561 MirrorGroup error codes (256 -- 511)
562 256 - No more mirrors left to try
563
564 Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
565 [ this range reserved for application-specific error codes ]
566
567 Retry codes (< 0)
568 -1 - retry the download, unknown reason
569
570 Note: to test which group a code is in, you can simply do integer
571 division by 256: e.errno / 256
572
573 Negative codes are reserved for use by functions passed in to
574 retrygrab with checkfunc. The value -1 is built in as a generic
575 retry code and is already included in the retrycodes list.
576 Therefore, you can create a custom check function that simply
577 returns -1 and the fetch will be re-tried. For more customized
578 retries, you can use other negative number and include them in
579 retry-codes. This is nice for outputting useful messages about
580 what failed.
581
582 You can use these error codes like so:
583 try: urlgrab(url)
584 except URLGrabError, e:
585 if e.errno == 3: ...
586 # or
587 print e.strerror
588 # or simply
589 print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
590 """
591 def __init__(self, *args):
592 IOError.__init__(self, *args)
593 self.url = "No url specified"
594
595class CallbackObject:
596 """Container for returned callback data.
597
598 This is currently a dummy class into which urlgrabber can stuff
599 information for passing to callbacks. This way, the prototype for
600 all callbacks is the same, regardless of the data that will be
601 passed back. Any function that accepts a callback function as an
602 argument SHOULD document what it will define in this object.
603
604 It is possible that this class will have some greater
605 functionality in the future.
606 """
607 def __init__(self, **kwargs):
608 self.__dict__.update(kwargs)
609
610def urlgrab(url, filename=None, **kwargs):
611 """grab the file at <url> and make a local copy at <filename>
612 If filename is none, the basename of the url is used.
613 urlgrab returns the filename of the local file, which may be different
614 from the passed-in filename if the copy_local kwarg == 0.
615
616 See module documentation for a description of possible kwargs.
617 """
618 return default_grabber.urlgrab(url, filename, **kwargs)
619
620def urlopen(url, **kwargs):
621 """open the url and return a file object
622 If a progress object or throttle specifications exist, then
623 a special file object will be returned that supports them.
624 The file object can be treated like any other file object.
625
626 See module documentation for a description of possible kwargs.
627 """
628 return default_grabber.urlopen(url, **kwargs)
629
630def urlread(url, limit=None, **kwargs):
631 """read the url into a string, up to 'limit' bytes
632 If the limit is exceeded, an exception will be thrown. Note that urlread
633 is NOT intended to be used as a way of saying "I want the first N bytes"
634 but rather 'read the whole file into memory, but don't use too much'
635
636 See module documentation for a description of possible kwargs.
637 """
638 return default_grabber.urlread(url, limit, **kwargs)
639
640
641class URLParser:
642 """Process the URLs before passing them to urllib2.
643
644 This class does several things:
645
646 * add any prefix
647 * translate a "raw" file to a proper file: url
648 * handle any http or https auth that's encoded within the url
649 * quote the url
650
651 Only the "parse" method is called directly, and it calls sub-methods.
652
653 An instance of this class is held in the options object, which
654 means that it's easy to change the behavior by sub-classing and
655 passing the replacement in. It need only have a method like:
656
657 url, parts = urlparser.parse(url, opts)
658 """
659
660 def parse(self, url, opts):
661 """parse the url and return the (modified) url and its parts
662
663 Note: a raw file WILL be quoted when it's converted to a URL.
664 However, other urls (ones which come with a proper scheme) may
665 or may not be quoted according to opts.quote
666
667 opts.quote = 1 --> quote it
668 opts.quote = 0 --> do not quote it
669 opts.quote = None --> guess
670 """
671 quote = opts.quote
672
673 if opts.prefix:
674 url = self.add_prefix(url, opts.prefix)
675
676 parts = urlparse.urlparse(url)
677 (scheme, host, path, parm, query, frag) = parts
678
679 if not scheme or (len(scheme) == 1 and scheme in string.letters):
680 # if a scheme isn't specified, we guess that it's "file:"
681 if url[0] not in '/\\': url = os.path.abspath(url)
682 url = 'file:' + urllib.pathname2url(url)
683 parts = urlparse.urlparse(url)
684 quote = 0 # pathname2url quotes, so we won't do it again
685
686 if scheme in ['http', 'https']:
687 parts = self.process_http(parts, url)
688
689 if quote is None:
690 quote = self.guess_should_quote(parts)
691 if quote:
692 parts = self.quote(parts)
693
694 url = urlparse.urlunparse(parts)
695 return url, parts
696
697 def add_prefix(self, url, prefix):
698 if prefix[-1] == '/' or url[0] == '/':
699 url = prefix + url
700 else:
701 url = prefix + '/' + url
702 return url
703
704 def process_http(self, parts, url):
705 (scheme, host, path, parm, query, frag) = parts
706 # TODO: auth-parsing here, maybe? pycurl doesn't really need it
707 return (scheme, host, path, parm, query, frag)
708
709 def quote(self, parts):
710 """quote the URL
711
712 This method quotes ONLY the path part. If you need to quote
713 other parts, you should override this and pass in your derived
714 class. The other alternative is to quote other parts before
715 passing into urlgrabber.
716 """
717 (scheme, host, path, parm, query, frag) = parts
718 path = urllib.quote(path)
719 return (scheme, host, path, parm, query, frag)
720
721 hexvals = '0123456789ABCDEF'
722 def guess_should_quote(self, parts):
723 """
724 Guess whether we should quote a path. This amounts to
725 guessing whether it's already quoted.
726
727 find ' ' -> 1
728 find '%' -> 1
729 find '%XX' -> 0
730 else -> 1
731 """
732 (scheme, host, path, parm, query, frag) = parts
733 if ' ' in path:
734 return 1
735 ind = string.find(path, '%')
736 if ind > -1:
737 while ind > -1:
738 if len(path) < ind+3:
739 return 1
740 code = path[ind+1:ind+3].upper()
741 if code[0] not in self.hexvals or \
742 code[1] not in self.hexvals:
743 return 1
744 ind = string.find(path, '%', ind+1)
745 return 0
746 return 1
747
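# Sketch of the subclassing hook described in the quote() docstring above; it
# assumes only that the grabber options accept an 'urlparser' kwarg (set up in
# URLGrabberOptions below). The class name and quoting policy are illustrative.
#
#     class QueryQuotingParser(URLParser):
#         def quote(self, parts):
#             (scheme, host, path, parm, query, frag) = parts
#             return (scheme, host, urllib.quote(path), parm,
#                     urllib.quote(query, safe='=&'), frag)
#
#     urlgrab(url, urlparser=QueryQuotingParser())
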
748class URLGrabberOptions:
749 """Class to ease kwargs handling."""
750
751 def __init__(self, delegate=None, **kwargs):
752 """Initialize URLGrabberOptions object.
753 Set default values for all options and then update options specified
754 in kwargs.
755 """
756 self.delegate = delegate
757 if delegate is None:
758 self._set_defaults()
759 self._set_attributes(**kwargs)
760
761 def __getattr__(self, name):
762 if self.delegate and hasattr(self.delegate, name):
763 return getattr(self.delegate, name)
764 raise AttributeError, name
765
766 def raw_throttle(self):
767 """Calculate raw throttle value from throttle and bandwidth
768 values.
769 """
770 if self.throttle <= 0:
771 return 0
772 elif type(self.throttle) == type(0):
773 return float(self.throttle)
774 else: # throttle is a float
775 return self.bandwidth * self.throttle
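        # Worked example of the rule above (numbers are illustrative): with
        # bandwidth=100000 and a float throttle=0.5, raw_throttle() is 50000
        # bytes/s; with an int throttle=25000 the bandwidth value is ignored
        # and the cap is 25000 bytes/s; throttle=0 disables throttling.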
776
777 def derive(self, **kwargs):
778 """Create a derived URLGrabberOptions instance.
779 This method creates a new instance and overrides the
780 options specified in kwargs.
781 """
782 return URLGrabberOptions(delegate=self, **kwargs)
783
784 def _set_attributes(self, **kwargs):
785 """Update object attributes with those provided in kwargs."""
786 self.__dict__.update(kwargs)
787 if kwargs.has_key('range'):
788 # normalize the supplied range value
789 self.range = range_tuple_normalize(self.range)
790 if not self.reget in [None, 'simple', 'check_timestamp']:
791 raise URLGrabError(11, _('Illegal reget mode: %s') \
792 % (self.reget, ))
793
794 def _set_defaults(self):
795 """Set all options to their default values.
796 When adding new options, make sure a default is
797 provided here.
798 """
799 self.progress_obj = None
800 self.throttle = 1.0
801 self.bandwidth = 0
802 self.retry = None
803 self.retrycodes = [-1,2,4,5,6,7]
804 self.checkfunc = None
805 self.copy_local = 0
806 self.close_connection = 0
807 self.range = None
808 self.user_agent = 'urlgrabber/%s' % __version__
809 self.keepalive = 1
810 self.proxies = None
811 self.reget = None
812 self.failure_callback = None
813 self.interrupt_callback = None
814 self.prefix = None
815 self.opener = None
816 self.cache_openers = True
817 self.timeout = 300
818 self.text = None
819 self.http_headers = None
820 self.ftp_headers = None
821 self.data = None
822 self.urlparser = URLParser()
823 self.quote = None
824 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
825 self.ssl_context = None # no-op in pycurl
826 self.ssl_verify_peer = True # check peer's cert for authenticityb
827 self.ssl_verify_host = True # make sure who they are and who the cert is for matches
828 self.ssl_key = None # client key
829 self.ssl_key_type = 'PEM' #(or DER)
830 self.ssl_cert = None # client cert
831 self.ssl_cert_type = 'PEM' # (or DER)
832 self.ssl_key_pass = None # password to access the key
833 self.size = None # if we know how big the thing we're getting is going
834 # to be. this is ultimately a MAXIMUM size for the file
835 self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
836
837 def __repr__(self):
838 return self.format()
839
840 def format(self, indent=' '):
841 keys = self.__dict__.keys()
842 if self.delegate is not None:
843 keys.remove('delegate')
844 keys.sort()
845 s = '{\n'
846 for k in keys:
847 s = s + indent + '%-15s: %s,\n' % \
848 (repr(k), repr(self.__dict__[k]))
849 if self.delegate:
850 df = self.delegate.format(indent + ' ')
851 s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
852 s = s + indent + '}'
853 return s
854
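# Minimal sketch of the delegation chain implemented above: a derived options
# object stores only the overridden values and falls back to its delegate via
# __getattr__. The option values are illustrative.
#
#     base  = URLGrabberOptions(retry=3, timeout=30)
#     child = base.derive(timeout=10)
#     # child.timeout == 10 (overridden), child.retry == 3 (from the delegate)
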
855class URLGrabber:
856 """Provides easy opening of URLs with a variety of options.
857
858 All options are specified as kwargs. Options may be specified when
859 the class is created and may be overridden on a per request basis.
860
861 New objects inherit default values from default_grabber.
862 """
863
864 def __init__(self, **kwargs):
865 self.opts = URLGrabberOptions(**kwargs)
866
867 def _retry(self, opts, func, *args):
868 tries = 0
869 while 1:
870 # there are only two ways out of this loop. The second has
871 # several "sub-ways"
872 # 1) via the return in the "try" block
873 # 2) by some exception being raised
874 # a) an excepton is raised that we don't "except"
875 # b) a callback raises ANY exception
876 # c) we're not retry-ing or have run out of retries
877 # d) the URLGrabError code is not in retrycodes
878 # beware of infinite loops :)
879 tries = tries + 1
880 exception = None
881 retrycode = None
882 callback = None
883 if DEBUG: DEBUG.info('attempt %i/%s: %s',
884 tries, opts.retry, args[0])
885 try:
886 r = apply(func, (opts,) + args, {})
887 if DEBUG: DEBUG.info('success')
888 return r
889 except URLGrabError, e:
890 exception = e
891 callback = opts.failure_callback
892 retrycode = e.errno
893 except KeyboardInterrupt, e:
894 exception = e
895 callback = opts.interrupt_callback
896
897 if DEBUG: DEBUG.info('exception: %s', exception)
898 if callback:
899 if DEBUG: DEBUG.info('calling callback: %s', callback)
900 cb_func, cb_args, cb_kwargs = self._make_callback(callback)
901 obj = CallbackObject(exception=exception, url=args[0],
902 tries=tries, retry=opts.retry)
903 cb_func(obj, *cb_args, **cb_kwargs)
904
905 if (opts.retry is None) or (tries == opts.retry):
906 if DEBUG: DEBUG.info('retries exceeded, re-raising')
907 raise
908
909 if (retrycode is not None) and (retrycode not in opts.retrycodes):
910 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
911 retrycode, opts.retrycodes)
912 raise
913
914 def urlopen(self, url, **kwargs):
915 """open the url and return a file object
916 If a progress object or throttle value specified when this
917 object was created, then a special file object will be
918 returned that supports them. The file object can be treated
919 like any other file object.
920 """
921 opts = self.opts.derive(**kwargs)
922 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
923 (url,parts) = opts.urlparser.parse(url, opts)
924 def retryfunc(opts, url):
925 return PyCurlFileObject(url, filename=None, opts=opts)
926 return self._retry(opts, retryfunc, url)
927
928 def urlgrab(self, url, filename=None, **kwargs):
929 """grab the file at <url> and make a local copy at <filename>
930 If filename is none, the basename of the url is used.
931 urlgrab returns the filename of the local file, which may be
932 different from the passed-in filename if copy_local == 0.
933 """
934 opts = self.opts.derive(**kwargs)
935 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
936 (url,parts) = opts.urlparser.parse(url, opts)
937 (scheme, host, path, parm, query, frag) = parts
938 if filename is None:
939 filename = os.path.basename( urllib.unquote(path) )
940 if scheme == 'file' and not opts.copy_local:
941 # just return the name of the local file - don't make a
942 # copy currently
943 path = urllib.url2pathname(path)
944 if host:
945 path = os.path.normpath('//' + host + path)
946 if not os.path.exists(path):
947 err = URLGrabError(2,
948 _('Local file does not exist: %s') % (path, ))
949 err.url = url
950 raise err
951 elif not os.path.isfile(path):
952 err = URLGrabError(3,
953 _('Not a normal file: %s') % (path, ))
954 err.url = url
955 raise err
956
957 elif not opts.range:
958 if not opts.checkfunc is None:
959 cb_func, cb_args, cb_kwargs = \
960 self._make_callback(opts.checkfunc)
961 obj = CallbackObject()
962 obj.filename = path
963 obj.url = url
964 apply(cb_func, (obj, )+cb_args, cb_kwargs)
965 return path
966
967 def retryfunc(opts, url, filename):
968 fo = PyCurlFileObject(url, filename, opts)
969 try:
970 fo._do_grab()
971 if not opts.checkfunc is None:
972 cb_func, cb_args, cb_kwargs = \
973 self._make_callback(opts.checkfunc)
974 obj = CallbackObject()
975 obj.filename = filename
976 obj.url = url
977 apply(cb_func, (obj, )+cb_args, cb_kwargs)
978 finally:
979 fo.close()
980 return filename
981
982 return self._retry(opts, retryfunc, url, filename)
983
984 def urlread(self, url, limit=None, **kwargs):
985 """read the url into a string, up to 'limit' bytes
986 If the limit is exceeded, an exception will be thrown. Note
987 that urlread is NOT intended to be used as a way of saying
988 "I want the first N bytes" but rather 'read the whole file
989 into memory, but don't use too much'
990 """
991 opts = self.opts.derive(**kwargs)
992 if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
993 (url,parts) = opts.urlparser.parse(url, opts)
994 if limit is not None:
995 limit = limit + 1
996
997 def retryfunc(opts, url, limit):
998 fo = PyCurlFileObject(url, filename=None, opts=opts)
999 s = ''
1000 try:
1001 # this is an unfortunate thing. Some file-like objects
1002 # have a default "limit" of None, while the built-in (real)
1003 # file objects have -1. They each break the other, so for
1004 # now, we just force the default if necessary.
1005 if limit is None: s = fo.read()
1006 else: s = fo.read(limit)
1007
1008 if not opts.checkfunc is None:
1009 cb_func, cb_args, cb_kwargs = \
1010 self._make_callback(opts.checkfunc)
1011 obj = CallbackObject()
1012 obj.data = s
1013 obj.url = url
1014 apply(cb_func, (obj, )+cb_args, cb_kwargs)
1015 finally:
1016 fo.close()
1017 return s
1018
1019 s = self._retry(opts, retryfunc, url, limit)
1020 if limit and len(s) > limit:
1021 err = URLGrabError(8,
1022 _('Exceeded limit (%i): %s') % (limit, url))
1023 err.url = url
1024 raise err
1025
1026 return s
1027
1028 def _make_callback(self, callback_obj):
1029 if callable(callback_obj):
1030 return callback_obj, (), {}
1031 else:
1032 return callback_obj
1033
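# Illustrative sketch tying the pieces above together; the URL, filename and
# check logic are placeholders. checkfunc may be a bare callable (as here) or
# a (func, args, kwargs) tuple, which is what _make_callback() normalizes.
#
#     from urlgrabber.progress import text_progress_meter
#
#     def size_check(cb_obj):
#         if os.path.getsize(cb_obj.filename) == 0:
#             raise URLGrabError(-1, 'empty file, forcing a retry')
#
#     g = URLGrabber(retry=3, progress_obj=text_progress_meter())
#     g.urlgrab('http://example.com/big.iso', '/tmp/big.iso', checkfunc=size_check)
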
1034# create the default URLGrabber used by urlXXX functions.
1035# NOTE: actual defaults are set in URLGrabberOptions
1036default_grabber = URLGrabber()
1037
1038
1039class PyCurlFileObject():
1040 def __init__(self, url, filename, opts):
1041 self.fo = None
1042 self._hdr_dump = ''
1043 self._parsed_hdr = None
1044 self.url = url
1045 self.scheme = urlparse.urlsplit(self.url)[0]
1046 self.filename = filename
1047 self.append = False
1048 self.reget_time = None
1049 self.opts = opts
1050 if self.opts.reget == 'check_timestamp':
1051 raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
1052 self._complete = False
1053 self._rbuf = ''
1054 self._rbufsize = 1024*8
1055 self._ttime = time.time()
1056 self._tsize = 0
1057 self._amount_read = 0
1058 self._reget_length = 0
1059 self._prog_running = False
1060 self._error = (None, None)
1061 self.size = 0
1062 self._hdr_ended = False
1063 self._do_open()
1064
1065
1066 def geturl(self):
1067 """ Provide the geturl() method, used to be got from
1068 urllib.addinfourl, via. urllib.URLopener.* """
1069 return self.url
1070
1071 def __getattr__(self, name):
1072 """This effectively allows us to wrap at the instance level.
1073 Any attribute not found in _this_ object will be searched for
1074 in self.fo. This includes methods."""
1075
1076 if hasattr(self.fo, name):
1077 return getattr(self.fo, name)
1078 raise AttributeError, name
1079
1080 def _retrieve(self, buf):
1081 try:
1082 if not self._prog_running:
1083 if self.opts.progress_obj:
1084 size = self.size + self._reget_length
1085 self.opts.progress_obj.start(self._prog_reportname,
1086 urllib.unquote(self.url),
1087 self._prog_basename,
1088 size=size,
1089 text=self.opts.text)
1090 self._prog_running = True
1091 self.opts.progress_obj.update(self._amount_read)
1092
1093 self._amount_read += len(buf)
1094 self.fo.write(buf)
1095 return len(buf)
1096 except KeyboardInterrupt:
1097 return -1
1098
1099 def _hdr_retrieve(self, buf):
1100 if self._hdr_ended:
1101 self._hdr_dump = ''
1102 self.size = 0
1103 self._hdr_ended = False
1104
1105 if self._over_max_size(cur=len(self._hdr_dump),
1106 max_size=self.opts.max_header_size):
1107 return -1
1108 try:
1109 self._hdr_dump += buf
1110 # we have to get the size before we do the progress obj start
1111 # but we can't do that w/o making it do 2 connects, which sucks
1112 # so we cheat and stuff it in here in the hdr_retrieve
1113 if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
1114 length = buf.split(':')[1]
1115 self.size = int(length)
1116 elif self.scheme in ['ftp']:
1117 s = None
1118 if buf.startswith('213 '):
1119 s = buf[3:].strip()
1120 elif buf.startswith('150 '):
1121 s = parse150(buf)
1122 if s:
1123 self.size = int(s)
1124
1125 if buf.lower().find('location') != -1:
1126 location = ':'.join(buf.split(':')[1:])
1127 location = location.strip()
1128 self.scheme = urlparse.urlsplit(location)[0]
1129 self.url = location
1130
1131 if len(self._hdr_dump) != 0 and buf == '\r\n':
1132 self._hdr_ended = True
1133 if DEBUG: DEBUG.info('header ended:')
1134
1135 return len(buf)
1136 except KeyboardInterrupt:
1137 return pycurl.READFUNC_ABORT
1138
1139 def _return_hdr_obj(self):
1140 if self._parsed_hdr:
1141 return self._parsed_hdr
1142 statusend = self._hdr_dump.find('\n')
1143 statusend += 1 # ridiculous as it may seem.
1144 hdrfp = StringIO()
1145 hdrfp.write(self._hdr_dump[statusend:])
1146 hdrfp.seek(0)
1147 self._parsed_hdr = mimetools.Message(hdrfp)
1148 return self._parsed_hdr
1149
1150 hdr = property(_return_hdr_obj)
1151 http_code = property(fget=
1152 lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
1153
1154 def _set_opts(self, opts={}):
1155 # XXX
1156 if not opts:
1157 opts = self.opts
1158
1159
1160 # defaults we're always going to set
1161 self.curl_obj.setopt(pycurl.NOPROGRESS, False)
1162 self.curl_obj.setopt(pycurl.NOSIGNAL, True)
1163 self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
1164 self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
1165 self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
1166 self.curl_obj.setopt(pycurl.FAILONERROR, True)
1167 self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
1168 self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
1169
1170 if DEBUG:
1171 self.curl_obj.setopt(pycurl.VERBOSE, True)
1172 if opts.user_agent:
1173 self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
1174
1175 # maybe to be options later
1176 self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
1177 self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
1178
1179 # timeouts
1180 timeout = 300
1181 if hasattr(opts, 'timeout'):
1182 timeout = int(opts.timeout or 0)
1183 self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
1184 self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
1185 self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
1186
1187 # ssl options
1188 if self.scheme == 'https':
1189 if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
1190 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
1191 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
1192 self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
1193 self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
1194 if opts.ssl_key:
1195 self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
1196 if opts.ssl_key_type:
1197 self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
1198 if opts.ssl_cert:
1199 self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
1200 if opts.ssl_cert_type:
1201 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
1202 if opts.ssl_key_pass:
1203 self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
1204
1205 #headers:
1206 if opts.http_headers and self.scheme in ('http', 'https'):
1207 headers = []
1208 for (tag, content) in opts.http_headers:
1209 headers.append('%s:%s' % (tag, content))
1210 self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
1211
1212 # ranges:
1213 if opts.range or opts.reget:
1214 range_str = self._build_range()
1215 if range_str:
1216 self.curl_obj.setopt(pycurl.RANGE, range_str)
1217
1218 # throttle/bandwidth
1219 if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
1220 self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
1221
1222 # proxy settings
1223 if opts.proxies:
1224 for (scheme, proxy) in opts.proxies.items():
1225 if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
1226 if scheme not in ('ftp'):
1227 continue
1228 else:
1229 if proxy == '_none_': proxy = ""
1230 self.curl_obj.setopt(pycurl.PROXY, proxy)
1231 elif self.scheme in ('http', 'https'):
1232 if scheme not in ('http', 'https'):
1233 continue
1234 else:
1235 if proxy == '_none_': proxy = ""
1236 self.curl_obj.setopt(pycurl.PROXY, proxy)
1237
1238 # FIXME username/password/auth settings
1239
1240 #posts - simple - expects the fields as they are
1241 if opts.data:
1242 self.curl_obj.setopt(pycurl.POST, True)
1243 self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
1244
1245 # our url
1246 self.curl_obj.setopt(pycurl.URL, self.url)
1247
1248
1249 def _do_perform(self):
1250 if self._complete:
1251 return
1252
1253 try:
1254 self.curl_obj.perform()
1255 except pycurl.error, e:
1256 # XXX - break some of these out a bit more clearly
1257 # to other URLGrabErrors from
1258 # http://curl.haxx.se/libcurl/c/libcurl-errors.html
1259 # this covers e.args[0] == 22 pretty well - which will be common
1260
1261 code = self.http_code
1262 errcode = e.args[0]
1263 if self._error[0]:
1264 errcode = self._error[0]
1265
1266 if errcode == 23 and code >= 200 and code < 299:
1267 err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
1268 err.url = self.url
1269
1270 # this is probably wrong but ultimately this is what happens
1271 # we have a legit http code and a pycurl 'writer failed' code
1272 # which almost always means something aborted it from outside
1273 # since we cannot know what it is -I'm banking on it being
1274 # a ctrl-c. XXXX - if there's a way of going back two raises to
1275 # figure out what aborted the pycurl process FIXME
1276 raise KeyboardInterrupt
1277
1278 elif errcode == 28:
1279 err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1280 err.url = self.url
1281 raise err
1282 elif errcode == 35:
1283 msg = _("problem making ssl connection")
1284 err = URLGrabError(14, msg)
1285 err.url = self.url
1286 raise err
1287 elif errcode == 37:
1288 msg = _("Could not open/read %s") % (self.url)
1289 err = URLGrabError(14, msg)
1290 err.url = self.url
1291 raise err
1292
1293 elif errcode == 42:
1294 err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
1295 err.url = self.url
1296 # this is probably wrong but ultimately this is what happens
1297 # we have a legit http code and a pycurl 'writer failed' code
1298 # which almost always means something aborted it from outside
1299 # since we cannot know what it is -I'm banking on it being
1300 # a ctrl-c. XXXX - if there's a way of going back two raises to
1301 # figure out what aborted the pycurl process FIXME
1302 raise KeyboardInterrupt
1303
1304 elif errcode == 58:
1305 msg = _("problem with the local client certificate")
1306 err = URLGrabError(14, msg)
1307 err.url = self.url
1308 raise err
1309
1310 elif errcode == 60:
1311 msg = _("Peer cert cannot be verified or peer cert invalid")
1312 err = URLGrabError(14, msg)
1313 err.url = self.url
1314 raise err
1315
1316 elif errcode == 63:
1317 if self._error[1]:
1318 msg = self._error[1]
1319 else:
1320 msg = _("Max download size exceeded on %s") % (self.url)
1321 err = URLGrabError(14, msg)
1322 err.url = self.url
1323 raise err
1324
1325 elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
1326 if self.scheme in ['http', 'https']:
1327 msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
1328 elif self.scheme in ['ftp']:
1329 msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
1330 else:
1331 msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
1332 else:
1333 msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
1334 code = errcode
1335 err = URLGrabError(14, msg)
1336 err.code = code
1337 err.exception = e
1338 raise err
1339 else:
1340 if self._error[1]:
1341 msg = self._error[1]
1342 err = URLGRabError(14, msg)
1343 err.url = self.url
1344 raise err
1345
1346 def _do_open(self):
1347 self.curl_obj = _curl_cache
1348 self.curl_obj.reset() # reset all old settings away, just in case
1349 # setup any ranges
1350 self._set_opts()
1351 self._do_grab()
1352 return self.fo
1353
1354 def _add_headers(self):
1355 pass
1356
1357 def _build_range(self):
1358 reget_length = 0
1359 rt = None
1360 if self.opts.reget and type(self.filename) in types.StringTypes:
1361 # we have reget turned on and we're dumping to a file
1362 try:
1363 s = os.stat(self.filename)
1364 except OSError:
1365 pass
1366 else:
1367 self.reget_time = s[stat.ST_MTIME]
1368 reget_length = s[stat.ST_SIZE]
1369
1370 # Set initial length when regetting
1371 self._amount_read = reget_length
1372 self._reget_length = reget_length # set where we started from, too
1373
1374 rt = reget_length, ''
1375 self.append = 1
1376
1377 if self.opts.range:
1378 rt = self.opts.range
1379 if rt[0]: rt = (rt[0] + reget_length, rt[1])
1380
1381 if rt:
1382 header = range_tuple_to_header(rt)
1383 if header:
1384 return header.split('=')[1]
1385
1386
1387
1388 def _make_request(self, req, opener):
1389 #XXXX
1390 # This doesn't do anything really, but we could use this
1391 # instead of do_open() to catch a lot of crap errors as
1392 # mstenner did before here
1393 return (self.fo, self.hdr)
1394
1395 try:
1396 if self.opts.timeout:
1397 old_to = socket.getdefaulttimeout()
1398 socket.setdefaulttimeout(self.opts.timeout)
1399 try:
1400 fo = opener.open(req)
1401 finally:
1402 socket.setdefaulttimeout(old_to)
1403 else:
1404 fo = opener.open(req)
1405 hdr = fo.info()
1406 except ValueError, e:
1407 err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
1408 err.url = self.url
1409 raise err
1410
1411 except RangeError, e:
1412 err = URLGrabError(9, _('%s on %s') % (e, self.url))
1413 err.url = self.url
1414 raise err
1415 except urllib2.HTTPError, e:
1416 new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
1417 new_e.code = e.code
1418 new_e.exception = e
1419 new_e.url = self.url
1420 raise new_e
1421 except IOError, e:
1422 if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
1423 err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1424 err.url = self.url
1425 raise err
1426 else:
1427 err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
1428 err.url = self.url
1429 raise err
1430
1431 except OSError, e:
1432 err = URLGrabError(5, _('%s on %s') % (e, self.url))
1433 err.url = self.url
1434 raise err
1435
1436 except HTTPException, e:
1437 err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
1438 (e.__class__.__name__, self.url, e))
1439 err.url = self.url
1440 raise err
1441
1442 else:
1443 return (fo, hdr)
1444
1445 def _do_grab(self):
1446 """dump the file to a filename or StringIO buffer"""
1447
1448 if self._complete:
1449 return
1450 _was_filename = False
1451 if type(self.filename) in types.StringTypes and self.filename:
1452 _was_filename = True
1453 self._prog_reportname = str(self.filename)
1454 self._prog_basename = os.path.basename(self.filename)
1455
1456 if self.append: mode = 'ab'
1457 else: mode = 'wb'
1458
1459 if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
1460 (self.filename, mode))
1461 try:
1462 self.fo = open(self.filename, mode)
1463 except IOError, e:
1464 err = URLGrabError(16, _(\
1465 'error opening local file from %s, IOError: %s') % (self.url, e))
1466 err.url = self.url
1467 raise err
1468
1469 else:
1470 self._prog_reportname = 'MEMORY'
1471 self._prog_basename = 'MEMORY'
1472
1473
1474 self.fo = StringIO()
1475 # if this is to be a tempfile instead....
1476 # it just makes crap in the tempdir
1477 #fh, self._temp_name = mkstemp()
1478 #self.fo = open(self._temp_name, 'wb')
1479
1480
1481 self._do_perform()
1482
1483
1484
1485 if _was_filename:
1486 # close it up
1487 self.fo.flush()
1488 self.fo.close()
1489 # set the time
1490 mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
1491 if mod_time != -1:
1492 try:
1493 os.utime(self.filename, (mod_time, mod_time))
1494 except OSError, e:
1495 err = URLGrabError(16, _(\
1496 'error setting timestamp on file %s from %s, OSError: %s')
1497 % (self.filenameself.url, e))
1498 err.url = self.url
1499 raise err
1500 # re open it
1501 try:
1502 self.fo = open(self.filename, 'r')
1503 except IOError, e:
1504 err = URLGrabError(16, _(\
1505 'error opening file from %s, IOError: %s') % (self.url, e))
1506 err.url = self.url
1507 raise err
1508
1509 else:
1510 #self.fo = open(self._temp_name, 'r')
1511 self.fo.seek(0)
1512
1513 self._complete = True
1514
1515 def _fill_buffer(self, amt=None):
1516 """fill the buffer to contain at least 'amt' bytes by reading
1517 from the underlying file object. If amt is None, then it will
1518 read until it gets nothing more. It updates the progress meter
1519 and throttles after every self._rbufsize bytes."""
1520 # the _rbuf test is only in this first 'if' for speed. It's not
1521 # logically necessary
1522 if self._rbuf and not amt is None:
1523 L = len(self._rbuf)
1524 if amt > L:
1525 amt = amt - L
1526 else:
1527 return
1528
1529 # if we've made it here, then we don't have enough in the buffer
1530 # and we need to read more.
1531
1532 if not self._complete: self._do_grab() #XXX cheater - change on ranges
1533
1534 buf = [self._rbuf]
1535 bufsize = len(self._rbuf)
1536 while amt is None or amt:
1537 # first, delay if necessary for throttling reasons
1538 if self.opts.raw_throttle():
1539 diff = self._tsize/self.opts.raw_throttle() - \
1540 (time.time() - self._ttime)
1541 if diff > 0: time.sleep(diff)
1542 self._ttime = time.time()
1543
1544 # now read some data, up to self._rbufsize
1545 if amt is None: readamount = self._rbufsize
1546 else: readamount = min(amt, self._rbufsize)
1547 try:
1548 new = self.fo.read(readamount)
1549 except socket.error, e:
1550 err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
1551 err.url = self.url
1552 raise err
1553
1554 except socket.timeout, e:
1555 raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1556 err.url = self.url
1557 raise err
1558
1559 except IOError, e:
1560 raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e))
1561 err.url = self.url
1562 raise err
1563
1564 newsize = len(new)
1565 if not newsize: break # no more to read
1566
1567 if amt: amt = amt - newsize
1568 buf.append(new)
1569 bufsize = bufsize + newsize
1570 self._tsize = newsize
1571 self._amount_read = self._amount_read + newsize
1572 #if self.opts.progress_obj:
1573 # self.opts.progress_obj.update(self._amount_read)
1574
1575 self._rbuf = string.join(buf, '')
1576 return
1577
1578 def _progress_update(self, download_total, downloaded, upload_total, uploaded):
1579 if self._over_max_size(cur=self._amount_read-self._reget_length):
1580 return -1
1581
1582 try:
1583 if self._prog_running:
1584 downloaded += self._reget_length
1585 self.opts.progress_obj.update(downloaded)
1586 except KeyboardInterrupt:
1587 return -1
1588
1589 def _over_max_size(self, cur, max_size=None):
1590
1591 if not max_size:
1592 if not self.opts.size:
1593 max_size = self.size
1594 else:
1595 max_size = self.opts.size
1596
1597 if not max_size: return False # if we have None for all of the Max then this is dumb
1598
1599 if cur > int(float(max_size) * 1.10):
1600
1601 msg = _("Downloaded more than max size for %s: %s > %s") \
1602 % (self.url, cur, max_size)
1603 self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
1604 return True
1605 return False
1606
1607 def _to_utf8(self, obj, errors='replace'):
1608 '''convert 'unicode' to an encoded utf-8 byte string '''
1609 # stolen from yum.i18n
1610 if isinstance(obj, unicode):
1611 obj = obj.encode('utf-8', errors)
1612 return obj
1613
1614 def read(self, amt=None):
1615 self._fill_buffer(amt)
1616 if amt is None:
1617 s, self._rbuf = self._rbuf, ''
1618 else:
1619 s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
1620 return s
1621
1622 def readline(self, limit=-1):
1623 if not self._complete: self._do_grab()
1624 return self.fo.readline()
1625
1626 i = string.find(self._rbuf, '\n')
1627 while i < 0 and not (0 < limit <= len(self._rbuf)):
1628 L = len(self._rbuf)
1629 self._fill_buffer(L + self._rbufsize)
1630 if not len(self._rbuf) > L: break
1631 i = string.find(self._rbuf, '\n', L)
1632
1633 if i < 0: i = len(self._rbuf)
1634 else: i = i+1
1635 if 0 <= limit < len(self._rbuf): i = limit
1636
1637 s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
1638 return s
1639
1640 def close(self):
1641 if self._prog_running:
1642 self.opts.progress_obj.end(self._amount_read)
1643 self.fo.close()
1644
1645 def geturl(self):
1646 """ Provide the geturl() method, used to be got from
1647 urllib.addinfourl, via. urllib.URLopener.* """
1648 return self.url
1649
1650_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
1651
1652def reset_curl_obj():
1653 """To make sure curl has reread the network/dns info we force a reload"""
1654 global _curl_cache
1655 _curl_cache.close()
1656 _curl_cache = pycurl.Curl()
1657
1658
1659
1660
1661#####################################################################
1662# DEPRECATED FUNCTIONS
1663def set_throttle(new_throttle):
1664 """Deprecated. Use: default_grabber.throttle = new_throttle"""
1665 default_grabber.throttle = new_throttle
1666
1667def set_bandwidth(new_bandwidth):
1668 """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
1669 default_grabber.bandwidth = new_bandwidth
1670
1671def set_progress_obj(new_progress_obj):
1672 """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
1673 default_grabber.progress_obj = new_progress_obj
1674
1675def set_user_agent(new_user_agent):
1676 """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
1677 default_grabber.user_agent = new_user_agent
1678
1679def retrygrab(url, filename=None, copy_local=0, close_connection=0,
1680 progress_obj=None, throttle=None, bandwidth=None,
1681 numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
1682 """Deprecated. Use: urlgrab() with the retry arg instead"""
1683 kwargs = {'copy_local' : copy_local,
1684 'close_connection' : close_connection,
1685 'progress_obj' : progress_obj,
1686 'throttle' : throttle,
1687 'bandwidth' : bandwidth,
1688 'retry' : numtries,
1689 'retrycodes' : retrycodes,
1690 'checkfunc' : checkfunc
1691 }
1692 return urlgrab(url, filename, **kwargs)
1693
1694
1695#####################################################################
1696# TESTING
1697def _main_test():
1698 try: url, filename = sys.argv[1:3]
1699 except ValueError:
1700 print 'usage:', sys.argv[0], \
1701 '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1702 sys.exit()
1703
1704 kwargs = {}
1705 for a in sys.argv[3:]:
1706 k, v = string.split(a, '=', 1)
1707 kwargs[k] = int(v)
1708
1709 set_throttle(1.0)
1710 set_bandwidth(32 * 1024)
1711 print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
1712 default_grabber.bandwidth)
1713
1714 try: from progress import text_progress_meter
1715 except ImportError, e: pass
1716 else: kwargs['progress_obj'] = text_progress_meter()
1717
1718 try: name = apply(urlgrab, (url, filename), kwargs)
1719 except URLGrabError, e: print e
1720 else: print 'LOCAL FILE:', name
1721
1722
1723def _retry_test():
1724 try: url, filename = sys.argv[1:3]
1725 except ValueError:
1726 print 'usage:', sys.argv[0], \
1727 '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1728 sys.exit()
1729
1730 kwargs = {}
1731 for a in sys.argv[3:]:
1732 k, v = string.split(a, '=', 1)
1733 kwargs[k] = int(v)
1734
1735 try: from progress import text_progress_meter
1736 except ImportError, e: pass
1737 else: kwargs['progress_obj'] = text_progress_meter()
1738
1739 def cfunc(filename, hello, there='foo'):
1740 print hello, there
1741 import random
1742 rnum = random.random()
1743 if rnum < .5:
1744 print 'forcing retry'
1745 raise URLGrabError(-1, 'forcing retry')
1746 if rnum < .75:
1747 print 'forcing failure'
1748 raise URLGrabError(-2, 'forcing immediate failure')
1749 print 'success'
1750 return
1751
1752 kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
1753 try: name = apply(retrygrab, (url, filename), kwargs)
1754 except URLGrabError, e: print e
1755 else: print 'LOCAL FILE:', name
1756
1757def _file_object_test(filename=None):
1758 import cStringIO
1759 if filename is None:
1760 filename = __file__
1761 print 'using file "%s" for comparisons' % filename
1762 fo = open(filename)
1763 s_input = fo.read()
1764 fo.close()
1765
1766 for testfunc in [_test_file_object_smallread,
1767 _test_file_object_readall,
1768 _test_file_object_readline,
1769 _test_file_object_readlines]:
1770 fo_input = cStringIO.StringIO(s_input)
1771 fo_output = cStringIO.StringIO()
1772 wrapper = PyCurlFileObject(fo_input, None, 0)
1773 print 'testing %-30s ' % testfunc.__name__,
1774 testfunc(wrapper, fo_output)
1775 s_output = fo_output.getvalue()
1776 if s_output == s_input: print 'passed'
1777 else: print 'FAILED'
1778
1779def _test_file_object_smallread(wrapper, fo_output):
1780 while 1:
1781 s = wrapper.read(23)
1782 fo_output.write(s)
1783 if not s: return
1784
1785def _test_file_object_readall(wrapper, fo_output):
1786 s = wrapper.read()
1787 fo_output.write(s)
1788
1789def _test_file_object_readline(wrapper, fo_output):
1790 while 1:
1791 s = wrapper.readline()
1792 fo_output.write(s)
1793 if not s: return
1794
1795def _test_file_object_readlines(wrapper, fo_output):
1796 li = wrapper.readlines()
1797 fo_output.write(string.join(li, ''))
1798
1799if __name__ == '__main__':
1800 _main_test()
1801 _retry_test()
1802 _file_object_test('test')
18030
=== modified file 'ChangeLog'
--- ChangeLog 2010-06-21 20:36:19 +0000
+++ ChangeLog 2014-12-13 22:24:13 +0000
@@ -1,3 +1,11 @@
+2013-10-09 Zdenek Pavlas <zpavlas@redhat.com>
+
+        * lots of enahncements and bugfixes
+          (parallel downloading, mirror profiling, new options)
+        * updated authors, url
+        * updated unit tests
+        * bump version to 3.10
+
 2009-09-25 Seth Vidal <skvidal@fedoraproject.org>
 
         * urlgrabber/__init__.py: bump version to 3.9.1
=== modified file 'MANIFEST'
--- MANIFEST 2010-06-21 20:36:19 +0000
+++ MANIFEST 2014-12-13 22:24:13 +0000
@@ -1,3 +1,4 @@
+# file GENERATED by distutils, do NOT edit
 ChangeLog
 LICENSE
 MANIFEST
@@ -6,6 +7,7 @@
 makefile
 setup.py
 scripts/urlgrabber
+scripts/urlgrabber-ext-down
 test/base_test_code.py
 test/grabberperf.py
 test/munittest.py
=== modified file 'PKG-INFO'
--- PKG-INFO 2010-06-21 20:36:19 +0000
+++ PKG-INFO 2014-12-13 22:24:13 +0000
@@ -1,37 +1,37 @@
-Metadata-Version: 1.0
+Metadata-Version: 1.1
 Name: urlgrabber
-Version: 3.9.1
+Version: 3.10.1
 Summary: A high-level cross-protocol url-grabber
-Home-page: http://linux.duke.edu/projects/urlgrabber/
+Home-page: http://urlgrabber.baseurl.org/
 Author: Michael D. Stenner, Ryan Tomayko
-Author-email: mstenner@linux.duke.edu, skvidal@fedoraproject.org
+Author-email: mstenner@linux.duke.edu, zpavlas@redhat.com
 License: LGPL
 Description: A high-level cross-protocol url-grabber.
 
         Using urlgrabber, data can be fetched in three basic ways:
 
           urlgrab(url) copy the file to the local filesystem
           urlopen(url) open the remote file and return a file object
                        (like urllib2.urlopen)
           urlread(url) return the contents of the file as a string
 
         When using these functions (or methods), urlgrabber supports the
         following features:
 
           * identical behavior for http://, ftp://, and file:// urls
           * http keepalive - faster downloads of many files by using
             only a single connection
           * byte ranges - fetch only a portion of the file
           * reget - for a urlgrab, resume a partial download
           * progress meters - the ability to report download progress
             automatically, even when using urlopen!
           * throttling - restrict bandwidth usage
           * retries - automatically retry a download if it fails. The
             number of retries and failure types are configurable.
           * authenticated server access for http and ftp
           * proxy support - support for authenticated http and ftp proxies
           * mirror groups - treat a list of mirrors as a single source,
             automatically switching mirrors if there is a failure.
 
 Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
=== modified file 'README'
--- README 2005-10-23 12:29:28 +0000
+++ README 2014-12-13 22:24:13 +0000
@@ -19,7 +19,7 @@
     python setup.py bdist_rpm
 
 The rpms (both source and "binary") will be specific to the current
-distrubution/version and may not be portable to others. This is
+distribution/version and may not be portable to others. This is
 because they will be built for the currently installed python.
 
 keepalive.py and byterange.py are generic urllib2 extension modules and
=== modified file 'debian/changelog'
--- debian/changelog 2014-02-23 13:54:39 +0000
+++ debian/changelog 2014-12-13 22:24:13 +0000
@@ -1,3 +1,10 @@
+urlgrabber (3.10.1-0ubuntu1) vivid; urgency=medium
+
+  * New upstream release.
+  * Drop all patches, fixed upstream
+
+ -- Jackson Doak <noskcaj@ubuntu.com>  Sun, 14 Dec 2014 09:12:57 +1100
+
 urlgrabber (3.9.1-4ubuntu3) trusty; urgency=medium
 
   * Rebuild to drop files installed into /usr/share/pyshared.
=== removed file 'debian/patches/grabber_fix.diff'
--- debian/patches/grabber_fix.diff 2010-07-08 17:40:08 +0000
+++ debian/patches/grabber_fix.diff 1970-01-01 00:00:00 +0000
@@ -1,236 +0,0 @@
1--- urlgrabber-3.9.1/urlgrabber/grabber.py.orig 2010-07-02 21:24:12.000000000 -0400
2+++ urlgrabber-3.9.1/urlgrabber/grabber.py 2010-07-02 20:30:25.000000000 -0400
3@@ -68,14 +68,14 @@
4 (which can be set on default_grabber.throttle) is used. See
5 BANDWIDTH THROTTLING for more information.
6
7- timeout = None
8+ timeout = 300
9
10- a positive float expressing the number of seconds to wait for socket
11- operations. If the value is None or 0.0, socket operations will block
12- forever. Setting this option causes urlgrabber to call the settimeout
13- method on the Socket object used for the request. See the Python
14- documentation on settimeout for more information.
15- http://www.python.org/doc/current/lib/socket-objects.html
16+ a positive integer expressing the number of seconds to wait before
17+ timing out attempts to connect to a server. If the value is None
18+ or 0, connection attempts will not time out. The timeout is passed
19+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
20+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
21+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
22
23 bandwidth = 0
24
25@@ -439,6 +439,12 @@
26 except:
27 __version__ = '???'
28
29+try:
30+ # this part isn't going to do much - need to talk to gettext
31+ from i18n import _
32+except ImportError, msg:
33+ def _(st): return st
34+
35 ########################################################################
36 # functions for debugging output. These functions are here because they
37 # are also part of the module initialization.
38@@ -808,7 +814,7 @@
39 self.prefix = None
40 self.opener = None
41 self.cache_openers = True
42- self.timeout = None
43+ self.timeout = 300
44 self.text = None
45 self.http_headers = None
46 self.ftp_headers = None
47@@ -1052,9 +1058,15 @@
48 self._reget_length = 0
49 self._prog_running = False
50 self._error = (None, None)
51- self.size = None
52+ self.size = 0
53+ self._hdr_ended = False
54 self._do_open()
55
56+
57+ def geturl(self):
58+ """ Provide the geturl() method, used to be got from
59+ urllib.addinfourl, via. urllib.URLopener.* """
60+ return self.url
61
62 def __getattr__(self, name):
63 """This effectively allows us to wrap at the instance level.
64@@ -1085,9 +1097,14 @@
65 return -1
66
67 def _hdr_retrieve(self, buf):
68+ if self._hdr_ended:
69+ self._hdr_dump = ''
70+ self.size = 0
71+ self._hdr_ended = False
72+
73 if self._over_max_size(cur=len(self._hdr_dump),
74 max_size=self.opts.max_header_size):
75- return -1
76+ return -1
77 try:
78 self._hdr_dump += buf
79 # we have to get the size before we do the progress obj start
80@@ -1104,7 +1121,17 @@
81 s = parse150(buf)
82 if s:
83 self.size = int(s)
84-
85+
86+ if buf.lower().find('location') != -1:
87+ location = ':'.join(buf.split(':')[1:])
88+ location = location.strip()
89+ self.scheme = urlparse.urlsplit(location)[0]
90+ self.url = location
91+
92+ if len(self._hdr_dump) != 0 and buf == '\r\n':
93+ self._hdr_ended = True
94+ if DEBUG: DEBUG.info('header ended:')
95+
96 return len(buf)
97 except KeyboardInterrupt:
98 return pycurl.READFUNC_ABORT
99@@ -1113,8 +1140,10 @@
100 if self._parsed_hdr:
101 return self._parsed_hdr
102 statusend = self._hdr_dump.find('\n')
103+ statusend += 1 # ridiculous as it may seem.
104 hdrfp = StringIO()
105 hdrfp.write(self._hdr_dump[statusend:])
106+ hdrfp.seek(0)
107 self._parsed_hdr = mimetools.Message(hdrfp)
108 return self._parsed_hdr
109
110@@ -1136,6 +1165,7 @@
111 self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
112 self.curl_obj.setopt(pycurl.FAILONERROR, True)
113 self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
114+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
115
116 if DEBUG:
117 self.curl_obj.setopt(pycurl.VERBOSE, True)
118@@ -1148,9 +1178,11 @@
119
120 # timeouts
121 timeout = 300
122- if opts.timeout:
123- timeout = int(opts.timeout)
124- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
125+ if hasattr(opts, 'timeout'):
126+ timeout = int(opts.timeout or 0)
127+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
128+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
129+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
130
131 # ssl options
132 if self.scheme == 'https':
133@@ -1276,7 +1308,7 @@
134 raise err
135
136 elif errcode == 60:
137- msg = _("client cert cannot be verified or client cert incorrect")
138+ msg = _("Peer cert cannot be verified or peer cert invalid")
139 err = URLGrabError(14, msg)
140 err.url = self.url
141 raise err
142@@ -1291,7 +1323,12 @@
143 raise err
144
145 elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
146- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
147+ if self.scheme in ['http', 'https']:
148+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
149+ elif self.scheme in ['ftp']:
150+ msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
151+ else:
152+ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
153 else:
154 msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
155 code = errcode
156@@ -1299,6 +1336,12 @@
157 err.code = code
158 err.exception = e
159 raise err
160+ else:
161+ if self._error[1]:
162+ msg = self._error[1]
163+ err = URLGRabError(14, msg)
164+ err.url = self.url
165+ raise err
166
167 def _do_open(self):
168 self.curl_obj = _curl_cache
169@@ -1446,9 +1489,23 @@
170 # set the time
171 mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
172 if mod_time != -1:
173- os.utime(self.filename, (mod_time, mod_time))
174+ try:
175+ os.utime(self.filename, (mod_time, mod_time))
176+ except OSError, e:
177+ err = URLGrabError(16, _(\
178+ 'error setting timestamp on file %s from %s, OSError: %s')
179+ % (self.filenameself.url, e))
180+ err.url = self.url
181+ raise err
182 # re open it
183- self.fo = open(self.filename, 'r')
184+ try:
185+ self.fo = open(self.filename, 'r')
186+ except IOError, e:
187+ err = URLGrabError(16, _(\
188+ 'error opening file from %s, IOError: %s') % (self.url, e))
189+ err.url = self.url
190+ raise err
191+
192 else:
193 #self.fo = open(self._temp_name, 'r')
194 self.fo.seek(0)
195@@ -1532,11 +1589,14 @@
196 def _over_max_size(self, cur, max_size=None):
197
198 if not max_size:
199- max_size = self.size
200- if self.opts.size: # if we set an opts size use that, no matter what
201- max_size = self.opts.size
202+ if not self.opts.size:
203+ max_size = self.size
204+ else:
205+ max_size = self.opts.size
206+
207 if not max_size: return False # if we have None for all of the Max then this is dumb
208- if cur > max_size + max_size*.10:
209+
210+ if cur > int(float(max_size) * 1.10):
211
212 msg = _("Downloaded more than max size for %s: %s > %s") \
213 % (self.url, cur, max_size)
214@@ -1582,9 +1642,21 @@
215 self.opts.progress_obj.end(self._amount_read)
216 self.fo.close()
217
218-
219+ def geturl(self):
220+ """ Provide the geturl() method, used to be got from
221+ urllib.addinfourl, via. urllib.URLopener.* """
222+ return self.url
223+
224 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
225
226+def reset_curl_obj():
227+ """To make sure curl has reread the network/dns info we force a reload"""
228+ global _curl_cache
229+ _curl_cache.close()
230+ _curl_cache = pycurl.Curl()
231+
232+
233+
234
235 #####################################################################
236 # DEPRECATED FUNCTIONS
2370
=== removed file 'debian/patches/progress_fix.diff'
--- debian/patches/progress_fix.diff 2010-07-08 17:40:08 +0000
+++ debian/patches/progress_fix.diff 1970-01-01 00:00:00 +0000
@@ -1,11 +0,0 @@
1--- urlgrabber-3.9.1/urlgrabber/progress.py.orig 2010-07-02 21:25:51.000000000 -0400
2+++ urlgrabber-3.9.1/urlgrabber/progress.py 2010-07-02 20:30:25.000000000 -0400
3@@ -658,6 +658,8 @@
4 if seconds is None or seconds < 0:
5 if use_hours: return '--:--:--'
6 else: return '--:--'
7+ elif seconds == float('inf'):
8+ return 'Infinite'
9 else:
10 seconds = int(seconds)
11 minutes = seconds / 60
120
=== removed file 'debian/patches/progress_object_callback_fix.diff'
--- debian/patches/progress_object_callback_fix.diff 2011-08-09 17:45:08 +0000
+++ debian/patches/progress_object_callback_fix.diff 1970-01-01 00:00:00 +0000
@@ -1,21 +0,0 @@
1From: James Antill <james@and.org>
2Date: Thu, 19 May 2011 20:17:14 +0000 (-0400)
3Subject: Fix documentation for progress_object callback.
4X-Git-Url: http://yum.baseurl.org/gitweb?p=urlgrabber.git;a=commitdiff_plain;h=674d545ee303aa99701ffb982536851572d8db77
5
6Fix documentation for progress_object callback.
7---
8
9diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
10index 36212cf..f6f57bd 100644
11--- a/urlgrabber/grabber.py
12+++ b/urlgrabber/grabber.py
13@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
14 progress_obj = None
15
16 a class instance that supports the following methods:
17- po.start(filename, url, basename, length, text)
18+ po.start(filename, url, basename, size, now, text)
19 # length will be None if unknown
20 po.update(read) # read == bytes read so far
21 po.end()
220
=== modified file 'debian/patches/series'
--- debian/patches/series 2011-08-09 17:45:08 +0000
+++ debian/patches/series 2014-12-13 22:24:13 +0000
@@ -1,3 +0,0 @@
-grabber_fix.diff
-progress_fix.diff
-progress_object_callback_fix.diff
=== modified file 'scripts/urlgrabber'
--- scripts/urlgrabber 2010-06-21 20:36:19 +0000
+++ scripts/urlgrabber 2014-12-13 22:24:13 +0000
@@ -115,6 +115,7 @@
     including quotes in the case of strings.
       e.g. --user_agent='"foobar/2.0"'
 
+  --output FILE
   -o FILE          write output to FILE, otherwise the basename of the
                    url will be used
   -O               print the names of saved files to STDOUT
@@ -170,12 +171,17 @@
         return ug_options, ug_defaults
 
     def process_command_line(self):
-        short_options = 'vd:hoOpD'
+        short_options = 'vd:ho:OpD'
         long_options = ['profile', 'repeat=', 'verbose=',
-                        'debug=', 'help', 'progress']
+                        'debug=', 'help', 'progress', 'output=']
         ug_long = [ o + '=' for o in self.ug_options ]
-        optlist, args = getopt.getopt(sys.argv[1:], short_options,
-                                      long_options + ug_long)
+        try:
+            optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                          long_options + ug_long)
+        except getopt.GetoptError, e:
+            print >>sys.stderr, "Error:", e
+            self.help([], ret=1)
+
         self.verbose = 0
         self.debug = None
         self.outputfile = None
@@ -193,6 +199,7 @@
             if o == '--verbose': self.verbose = v
             if o == '-v': self.verbose += 1
             if o == '-o': self.outputfile = v
+            if o == '--output': self.outputfile = v
             if o == '-p' or o == '--progress': self.progress = 1
             if o == '-d' or o == '--debug': self.debug = v
             if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@
             print "ERROR: cannot use -o when grabbing multiple files"
             sys.exit(1)
 
-    def help(self, args):
+    def help(self, args, ret=0):
         if not args:
             print MAINHELP
         else:
@@ -234,7 +241,7 @@
                 self.help_ug_option(a)
             else:
                 print 'ERROR: no help on command "%s"' % a
-        sys.exit(0)
+        sys.exit(ret)
 
     def help_doc(self):
         print __doc__
@@ -294,6 +301,7 @@
                 if self.op.localfile: print f
             except URLGrabError, e:
                 print e
+                sys.exit(1)
 
     def set_debug_logger(self, dbspec):
         try:
=== added file 'scripts/urlgrabber-ext-down'
--- scripts/urlgrabber-ext-down 1970-01-01 00:00:00 +0000
+++ scripts/urlgrabber-ext-down 2014-12-13 22:24:13 +0000
@@ -0,0 +1,75 @@
+#! /usr/bin/python
+#  A very simple external downloader
+#  Copyright 2011-2012 Zdenek Pavlas
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+import time, os, errno, sys
+from urlgrabber.grabber import \
+    _readlines, URLGrabberOptions, _loads, \
+    PyCurlFileObject, URLGrabError
+
+def write(fmt, *arg):
+    try: os.write(1, fmt % arg)
+    except OSError, e:
+        if e.args[0] != errno.EPIPE: raise
+        sys.exit(1)
+
+class ProxyProgress:
+    def start(self, *d1, **d2):
+        self.next_update = 0
+    def update(self, _amount_read):
+        t = time.time()
+        if t < self.next_update: return
+        self.next_update = t + 0.31
+        write('%d %d\n', self._id, _amount_read)
+
+def main():
+    import signal
+    signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
+    cnt = 0
+    while True:
+        lines = _readlines(0)
+        if not lines: break
+        for line in lines:
+            cnt += 1
+            opts = URLGrabberOptions()
+            opts._id = cnt
+            for k in line.split(' '):
+                k, v = k.split('=', 1)
+                setattr(opts, k, _loads(v))
+            if opts.progress_obj:
+                opts.progress_obj = ProxyProgress()
+                opts.progress_obj._id = cnt
+
+            dlsz = dltm = 0
+            try:
+                fo = PyCurlFileObject(opts.url, opts.filename, opts)
+                fo._do_grab()
+                fo.fo.close()
+                size = fo._amount_read
+                if fo._tm_last:
+                    dlsz = fo._tm_last[0] - fo._tm_first[0]
+                    dltm = fo._tm_last[1] - fo._tm_first[1]
+                ug_err = 'OK'
+            except URLGrabError, e:
+                size = 0
+                ug_err = '%d %d %s' % (e.errno, getattr(e, 'code', 0), e.strerror)
+            write('%d %d %d %.3f %s\n', opts._id, size, dlsz, dltm, ug_err)
+
+if __name__ == '__main__':
+    main()
=== modified file 'setup.py'
--- setup.py 2005-10-23 12:29:28 +0000
+++ setup.py 2014-12-13 22:24:13 +0000
@@ -15,8 +15,10 @@
 packages = ['urlgrabber']
 package_dir = {'urlgrabber':'urlgrabber'}
 scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
-                ['README','LICENSE', 'TODO', 'ChangeLog'])]
+data_files = [
+  ('share/doc/' + name + '-' + version, ['README','LICENSE', 'TODO', 'ChangeLog']),
+  ('libexec', ['scripts/urlgrabber-ext-down']),
+]
 options = { 'clean' : { 'all' : 1 } }
 classifiers = [
         'Development Status :: 4 - Beta',
=== modified file 'test/base_test_code.py'
--- test/base_test_code.py 2005-10-23 12:29:28 +0000
+++ test/base_test_code.py 2014-12-13 22:24:13 +0000
@@ -1,6 +1,6 @@
 from munittest import *
 
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
=== modified file 'test/munittest.py'
--- test/munittest.py 2005-10-23 12:29:28 +0000
+++ test/munittest.py 2014-12-13 22:24:13 +0000
@@ -113,7 +113,7 @@
 __all__ = ['TestResult', 'TestCase', 'TestSuite', 'TextTestRunner',
            'TestLoader', 'FunctionTestCase', 'main', 'defaultTestLoader']
 
-# Expose obsolete functions for backwards compatability
+# Expose obsolete functions for backwards compatibility
 __all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases'])
 
 
@@ -410,7 +410,7 @@
            (default 7) and comparing to zero.
 
            Note that decimal places (from zero) is usually not the same
-           as significant digits (measured from the most signficant digit).
+           as significant digits (measured from the most significant digit).
        """
        if round(second-first, places) != 0:
            raise self.failureException, \
@@ -422,7 +422,7 @@
            (default 7) and comparing to zero.
 
           Note that decimal places (from zero) is usually not the same
-          as significant digits (measured from the most signficant digit).
+          as significant digits (measured from the most significant digit).
        """
        if round(second-first, places) == 0:
            raise self.failureException, \
=== modified file 'test/test_byterange.py'
--- test/test_byterange.py 2005-10-23 12:29:28 +0000
+++ test/test_byterange.py 2014-12-13 22:24:13 +0000
@@ -25,7 +25,7 @@
 
 import sys
 
-from StringIO import StringIO
+from cStringIO import StringIO
 from urlgrabber.byterange import RangeableFileObject
 
 from base_test_code import *
@@ -52,18 +52,6 @@
         self.rfo.seek(1,1)
         self.assertEquals('of', self.rfo.read(2))
 
-    def test_poor_mans_seek(self):
-        """RangeableFileObject.seek() poor mans version..
-
-        We just delete the seek method from StringIO so we can
-        excercise RangeableFileObject when the file object supplied
-        doesn't support seek.
-        """
-        seek = StringIO.seek
-        del(StringIO.seek)
-        self.test_seek()
-        StringIO.seek = seek
-
     def test_read(self):
         """RangeableFileObject.read()"""
         self.assertEquals('the', self.rfo.read(3))
=== modified file 'test/test_grabber.py'
--- test/test_grabber.py 2010-06-21 20:36:19 +0000
+++ test/test_grabber.py 2014-12-13 22:24:13 +0000
@@ -86,7 +86,7 @@
 
 class HTTPTests(TestCase):
     def test_reference_file(self):
-        "download refernce file via HTTP"
+        "download reference file via HTTP"
         filename = tempfile.mktemp()
         grabber.urlgrab(ref_http, filename)
 
@@ -98,6 +98,7 @@
 
     def test_post(self):
         "do an HTTP post"
+        self.skip() # disabled on server
         headers = (('Content-type', 'text/plain'),)
         ret = grabber.urlread(base_http + 'test_post.php',
                               data=short_reference_data,
=== modified file 'test/test_mirror.py'
--- test/test_mirror.py 2005-12-31 15:34:22 +0000
+++ test/test_mirror.py 2014-12-13 22:24:13 +0000
@@ -28,7 +28,7 @@
 import string, tempfile, random, cStringIO, os
 
 import urlgrabber.grabber
The diff has been truncated for viewing.
