From add16b8996c1e55801346bc6b216a9c3fc8337c3 Mon Sep 17 00:00:00 2001 From: Zdeněk Pavlas Date: Nov 01 2012 13:13:37 +0000 Subject: Update to latest HEAD --- diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index 2975737..d046d31 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,7 +3,7 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.9.1 -Release: 20%{?dist} +Release: 21%{?dist} Source0: urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch @@ -44,6 +44,10 @@ rm -rf $RPM_BUILD_ROOT %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down %changelog +* Thu Nov 1 2012 Zdeněk Pavlas - 3.9.1-21 +- Update to latest HEAD. +- Get rid of "HTTP 200 OK" errors. BZ 871835. + * Tue Sep 4 2012 Zdeněk Pavlas - 3.9.1-20 - Update to latest HEAD. - Fixed BZ 851178, 854075. diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch index ef304ad..55c3ba4 100644 --- a/urlgrabber-HEAD.patch +++ b/urlgrabber-HEAD.patch @@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644 return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..01218b0 100644 +index e090e90..74a692c 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs) @@ -458,7 +458,7 @@ index e090e90..01218b0 100644 ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. -@@ -527,6 +608,22 @@ def _(st): +@@ -527,6 +608,29 @@ def _(st): # END MODULE INITIALIZATION ######################################################################## @@ -475,13 +475,20 @@ index e090e90..01218b0 100644 + obj = obj.encode('utf-8', errors) + return obj + ++def exception2msg(e): ++ try: ++ return str(e) ++ except UnicodeEncodeError: ++ # always use byte strings ++ return unicode(e).encode('utf8') ++ +######################################################################## +# END UTILITY FUNCTIONS +######################################################################## class URLGrabError(IOError): -@@ -662,6 +759,7 @@ class URLParser: +@@ -662,6 +766,7 @@ class URLParser: opts.quote = 0 --> do not quote it opts.quote = None --> guess """ @@ -489,7 +496,7 @@ index e090e90..01218b0 100644 quote = opts.quote if opts.prefix: -@@ -768,6 +866,41 @@ class URLGrabberOptions: +@@ -768,6 +873,41 @@ class URLGrabberOptions: else: # throttle is a float return self.bandwidth * self.throttle @@ -531,7 +538,7 @@ index e090e90..01218b0 100644 def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the -@@ -791,30 +924,37 @@ class URLGrabberOptions: +@@ -791,30 +931,37 @@ class URLGrabberOptions: provided here. """ self.progress_obj = None @@ -570,7 +577,7 @@ index e090e90..01218b0 100644 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_context = None # no-op in pycurl self.ssl_verify_peer = True # check peer's cert for authenticityb -@@ -827,6 +967,12 @@ class URLGrabberOptions: +@@ -827,6 +974,12 @@ class URLGrabberOptions: self.size = None # if we know how big the thing we're getting is going # to be. 
this is ultimately a MAXIMUM size for the file self.max_header_size = 2097152 #2mb seems reasonable for maximum header size @@ -583,7 +590,7 @@ index e090e90..01218b0 100644 def __repr__(self): return self.format() -@@ -846,7 +992,18 @@ class URLGrabberOptions: +@@ -846,7 +999,18 @@ class URLGrabberOptions: s = s + indent + '}' return s @@ -603,7 +610,7 @@ index e090e90..01218b0 100644 """Provides easy opening of URLs with a variety of options. All options are specified as kwargs. Options may be specified when -@@ -872,7 +1029,6 @@ class URLGrabber: +@@ -872,7 +1036,6 @@ class URLGrabber: # beware of infinite loops :) tries = tries + 1 exception = None @@ -611,7 +618,7 @@ index e090e90..01218b0 100644 callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) -@@ -883,54 +1039,62 @@ class URLGrabber: +@@ -883,54 +1046,62 @@ class URLGrabber: except URLGrabError, e: exception = e callback = opts.failure_callback @@ -681,7 +688,7 @@ index e090e90..01218b0 100644 if scheme == 'file' and not opts.copy_local: # just return the name of the local file - don't make a # copy currently -@@ -950,41 +1114,51 @@ class URLGrabber: +@@ -950,41 +1121,51 @@ class URLGrabber: elif not opts.range: if not opts.checkfunc is None: @@ -748,7 +755,7 @@ index e090e90..01218b0 100644 if limit is not None: limit = limit + 1 -@@ -1000,12 +1174,8 @@ class URLGrabber: +@@ -1000,12 +1181,8 @@ class URLGrabber: else: s = fo.read(limit) if not opts.checkfunc is None: @@ -763,7 +770,7 @@ index e090e90..01218b0 100644 finally: fo.close() return s -@@ -1020,6 +1190,7 @@ class URLGrabber: +@@ -1020,6 +1197,7 @@ class URLGrabber: return s def _make_callback(self, callback_obj): @@ -771,7 +778,7 @@ index e090e90..01218b0 100644 if callable(callback_obj): return callback_obj, (), {} else: -@@ -1030,7 +1201,7 @@ class URLGrabber: +@@ -1030,7 +1208,7 @@ class URLGrabber: default_grabber = URLGrabber() @@ -780,7 +787,7 @@ index e090e90..01218b0 100644 def __init__(self, url, filename, opts): self.fo = None self._hdr_dump = '' -@@ -1052,10 +1223,13 @@ class PyCurlFileObject(): +@@ -1052,10 +1230,13 @@ class PyCurlFileObject(): self._reget_length = 0 self._prog_running = False self._error = (None, None) @@ -796,7 +803,7 @@ index e090e90..01218b0 100644 def __getattr__(self, name): """This effectively allows us to wrap at the instance level. 
Any attribute not found in _this_ object will be searched for -@@ -1067,6 +1241,12 @@ class PyCurlFileObject(): +@@ -1067,6 +1248,12 @@ class PyCurlFileObject(): def _retrieve(self, buf): try: @@ -809,7 +816,18 @@ index e090e90..01218b0 100644 if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length -@@ -1085,9 +1265,14 @@ class PyCurlFileObject(): +@@ -1079,15 +1266,24 @@ class PyCurlFileObject(): + self.opts.progress_obj.update(self._amount_read) + + self._amount_read += len(buf) +- self.fo.write(buf) ++ try: ++ self.fo.write(buf) ++ except IOError, e: ++ self._cb_error = URLGrabError(16, exception2msg(e)) ++ return -1 + return len(buf) + except KeyboardInterrupt: return -1 def _hdr_retrieve(self, buf): @@ -825,7 +843,7 @@ index e090e90..01218b0 100644 try: self._hdr_dump += buf # we have to get the size before we do the progress obj start -@@ -1104,7 +1289,17 @@ class PyCurlFileObject(): +@@ -1104,7 +1300,17 @@ class PyCurlFileObject(): s = parse150(buf) if s: self.size = int(s) @@ -844,7 +862,7 @@ index e090e90..01218b0 100644 return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1308,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1319,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -855,7 +873,7 @@ index e090e90..01218b0 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1127,6 +1324,9 @@ class PyCurlFileObject(): +@@ -1127,6 +1335,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts @@ -865,7 +883,7 @@ index e090e90..01218b0 100644 # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) -@@ -1136,11 +1336,21 @@ class PyCurlFileObject(): +@@ -1136,11 +1347,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) @@ -887,7 +905,7 @@ index e090e90..01218b0 100644 # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) -@@ -1148,9 +1358,11 @@ class PyCurlFileObject(): +@@ -1148,9 +1369,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -897,12 +915,12 @@ index e090e90..01218b0 100644 + if hasattr(opts, 'timeout'): + timeout = int(opts.timeout or 0) + self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) -+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1) ++ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000) + self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout) # ssl options if self.scheme == 'https': -@@ -1158,13 +1370,16 @@ class PyCurlFileObject(): +@@ -1158,13 +1381,16 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) @@ -920,7 +938,7 @@ index e090e90..01218b0 100644 if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: -@@ -1187,28 +1402,26 @@ class PyCurlFileObject(): +@@ -1187,28 +1413,26 @@ class PyCurlFileObject(): if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) @@ -965,7 +983,7 @@ index e090e90..01218b0 100644 # our url self.curl_obj.setopt(pycurl.URL, self.url) -@@ -1228,12 +1441,14 @@ class PyCurlFileObject(): +@@ -1228,39 +1452,36 @@ class PyCurlFileObject(): code = self.http_code errcode = e.args[0] @@ -974,16 +992,19 @@ index 
e090e90..01218b0 100644 if self._error[0]: errcode = self._error[0] - if errcode == 23 and code >= 200 and code < 299: +- if errcode == 23 and code >= 200 and code < 299: - err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) - err.url = self.url -+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e)) -+ err.url = errurl - +- ++ if errcode == 23 and 200 <= code <= 299: # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code -@@ -1244,23 +1459,23 @@ class PyCurlFileObject(): - raise KeyboardInterrupt + # which almost always means something aborted it from outside + # since we cannot know what it is -I'm banking on it being + # a ctrl-c. XXXX - if there's a way of going back two raises to + # figure out what aborted the pycurl process FIXME +- raise KeyboardInterrupt ++ raise getattr(self, '_cb_error', KeyboardInterrupt) elif errcode == 28: - err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) @@ -1008,12 +1029,10 @@ index e090e90..01218b0 100644 elif errcode == 42: - err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) - err.url = self.url -+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e)) -+ err.url = errurl # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside -@@ -1272,33 +1487,94 @@ class PyCurlFileObject(): +@@ -1272,33 +1493,94 @@ class PyCurlFileObject(): elif errcode == 58: msg = _("problem with the local client certificate") err = URLGrabError(14, msg) @@ -1040,8 +1059,9 @@ index e090e90..01218b0 100644 + err.url = errurl raise err - elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it +- elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it - msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) ++ elif str(e.args[1]) == '' and code and not 200 <= code <= 299: + if self.scheme in ['http', 'https']: + if self.http_code in responses: + resp = responses[self.http_code] @@ -1115,7 +1135,7 @@ index e090e90..01218b0 100644 def _do_open(self): self.curl_obj = _curl_cache -@@ -1333,7 +1609,11 @@ class PyCurlFileObject(): +@@ -1333,7 +1615,11 @@ class PyCurlFileObject(): if self.opts.range: rt = self.opts.range @@ -1128,7 +1148,7 @@ index e090e90..01218b0 100644 if rt: header = range_tuple_to_header(rt) -@@ -1434,21 +1714,46 @@ class PyCurlFileObject(): +@@ -1434,21 +1720,46 @@ class PyCurlFileObject(): #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') @@ -1182,7 +1202,7 @@ index e090e90..01218b0 100644 else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) -@@ -1526,17 +1831,20 @@ class PyCurlFileObject(): +@@ -1526,17 +1837,20 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) @@ -1208,7 +1228,7 @@ index e090e90..01218b0 100644 msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) -@@ -1544,13 +1852,6 @@ class PyCurlFileObject(): +@@ -1544,13 +1858,6 @@ class PyCurlFileObject(): return True return False @@ -1222,7 +1242,7 @@ index e090e90..01218b0 100644 def read(self, amt=None): self._fill_buffer(amt) if amt is None: -@@ -1582,9 +1883,21 @@ class PyCurlFileObject(): +@@ -1582,9 +1889,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) 
self.fo.close() @@ -1245,7 +1265,7 @@ index e090e90..01218b0 100644 ##################################################################### # DEPRECATED FUNCTIONS -@@ -1621,6 +1934,466 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, +@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, ##################################################################### @@ -1580,11 +1600,12 @@ index e090e90..01218b0 100644 + if key in removed: continue + + # estimate mirror speed -+ speed = _TH.estimate(key) ++ speed, fail = _TH.estimate(key) + speed /= 1 + host_con.get(key, 0) + + # order by: least failures, private flag, best speed -+ private = mirror.get('kwargs', {}).get('private', False) ++ # ignore 'private' flag if there were failures ++ private = not fail and mirror.get('kwargs', {}).get('private', False) + speed = -failed.get(key, 0), private, speed + if best is None or speed > best_speed: + best = mirror @@ -1701,19 +1722,19 @@ index e090e90..01218b0 100644 + + default_speed = default_grabber.opts.default_speed + try: speed, fail, ts = _TH.hosts[host] -+ except KeyError: return default_speed ++ except KeyError: return default_speed, 0 + + speed *= 2**-fail + k = 2**((ts - time.time()) / default_grabber.opts.half_life) + speed = k * speed + (1 - k) * default_speed -+ return speed ++ return speed, fail + +##################################################################### # TESTING def _main_test(): try: url, filename = sys.argv[1:3] diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py -index dad410b..b17be17 100644 +index dad410b..7975f1b 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -76,6 +76,9 @@ CUSTOMIZATION @@ -1726,7 +1747,7 @@ index dad410b..b17be17 100644 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. 
-@@ -87,10 +90,12 @@ CUSTOMIZATION +@@ -87,10 +90,14 @@ CUSTOMIZATION """ @@ -1737,10 +1758,12 @@ index dad410b..b17be17 100644 -from grabber import URLGrabError, CallbackObject, DEBUG +from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 +from grabber import _run_callback, _do_raise ++from grabber import exception2msg ++from grabber import _TH def _(st): return st -@@ -126,7 +131,9 @@ class MirrorGroup: +@@ -126,7 +133,9 @@ class MirrorGroup: files) * if the local list is ever exhausted, a URLGrabError will be @@ -1751,7 +1774,7 @@ index dad410b..b17be17 100644 OPTIONS -@@ -153,7 +160,8 @@ class MirrorGroup: +@@ -153,7 +162,8 @@ class MirrorGroup: The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current @@ -1761,7 +1784,7 @@ index dad410b..b17be17 100644 This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) -@@ -184,6 +192,7 @@ class MirrorGroup: +@@ -184,6 +194,7 @@ class MirrorGroup: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > @@ -1769,7 +1792,25 @@ index dad410b..b17be17 100644 obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror -@@ -263,7 +272,8 @@ class MirrorGroup: +@@ -251,6 +262,17 @@ class MirrorGroup: + self.default_action = None + self._process_kwargs(kwargs) + ++ # use the same algorithm as parallel downloader to initially sort ++ # the mirror list (sort by speed, but prefer live private mirrors) ++ def estimate(m): ++ speed, fail = _TH.estimate(m['mirror']) ++ private = not fail and m.get('kwargs', {}).get('private', False) ++ return private, speed ++ ++ # update the initial order. since sorting is stable, the relative ++ # order of unknown (not used yet) hosts is retained. ++ self.mirrors.sort(key=estimate, reverse=True) ++ + # if these values are found in **kwargs passed to one of the urlXXX + # methods, they will be stripped before getting passed on to the + # grabber +@@ -263,7 +285,8 @@ class MirrorGroup: def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: @@ -1779,7 +1820,7 @@ index dad410b..b17be17 100644 parsed_mirrors.append(m) return parsed_mirrors -@@ -280,7 +290,9 @@ class MirrorGroup: +@@ -280,7 +303,9 @@ class MirrorGroup: # return a random mirror so that multiple mirrors get used # even without failures. if not gr.mirrors: @@ -1790,7 +1831,7 @@ index dad410b..b17be17 100644 return gr.mirrors[gr._next] def _failure(self, gr, cb_obj): -@@ -307,7 +319,9 @@ class MirrorGroup: +@@ -307,7 +332,9 @@ class MirrorGroup: a.update(action) action = a self.increment_mirror(gr, action) @@ -1801,7 +1842,7 @@ index dad410b..b17be17 100644 def increment_mirror(self, gr, action={}): """Tell the mirror object increment the mirror index -@@ -377,35 +391,50 @@ class MirrorGroup: +@@ -377,35 +404,50 @@ class MirrorGroup: gr.url = url gr.kw = dict(kw) self._load_gr(gr) @@ -1828,7 +1869,7 @@ index dad410b..b17be17 100644 + return func_ref( *(fullurl,), opts=opts, **kw ) except URLGrabError, e: if DEBUG: DEBUG.info('MIRROR: failed') -+ gr.errors.append((fullurl, str(e))) ++ gr.errors.append((fullurl, exception2msg(e))) obj = CallbackObject() obj.exception = e obj.mirror = mirrorchoice['mirror']
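
A note on exception2msg(), added to grabber.py above and imported by mirror.py: building an error message could previously blow up on its own, because str(e) raises UnicodeEncodeError when the exception's arguments contain non-ASCII unicode. The helper falls back to an explicit UTF-8 encode. A minimal self-contained sketch of the same idea, in the patch's own Python 2 dialect (the sample error text is hypothetical):

def exception2msg(e):
    try:
        return str(e)
    except UnicodeEncodeError:
        # always use byte strings
        return unicode(e).encode('utf8')

if __name__ == '__main__':
    # hypothetical exception carrying a non-ASCII unicode message
    err = IOError(u'p\u0159\xedli\u0161 dlouh\xfd n\xe1zev souboru')
    print exception2msg(err)    # a UTF-8 byte string; str(err) would raise

MirrorGroup uses this when recording failures (gr.errors.append((fullurl, exception2msg(e)))), which is why the new 'errors' attribute on the final "No more mirrors" URLGrabError can safely hold plain (url, errmsg) byte-string tuples.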
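
The changelog item 'Get rid of "HTTP 200 OK" errors' (BZ 871835) has two halves in the diff above: the fake-HTTP-error branch now fires only when the response code is outside 200-299, and _retrieve() catches the IOError raised by self.fo.write(buf), stores it as URLGrabError(16, ...) in self._cb_error, and returns -1 so libcurl aborts the transfer with error 23 (CURLE_WRITE_ERROR); the error handler then re-raises the stored exception instead of reporting a bogus HTTP error for a perfectly good 200 response. A stripped-down sketch of that flow, assuming only pycurl; Writer and fetch() are hypothetical names, not urlgrabber API:

import pycurl

class Writer(object):
    def __init__(self, fo):
        self.fo = fo
        self.cb_error = None            # the real reason the transfer died

    def write(self, buf):
        try:
            self.fo.write(buf)
            return len(buf)
        except IOError, e:
            self.cb_error = e           # e.g. ENOSPC, "No space left on device"
            return -1                   # make libcurl abort -> pycurl error 23

def fetch(url, fo):
    w = Writer(fo)
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, w.write)
    try:
        try:
            c.perform()
        except pycurl.error, e:
            code = c.getinfo(pycurl.HTTP_CODE)
            if e.args[0] == 23 and w.cb_error is not None and 200 <= code <= 299:
                raise w.cb_error        # surface the IOError, not "HTTP Error 200: OK"
            raise
    finally:
        c.close()

The design point is that a write-callback failure is the only way to smuggle a local error out of libcurl, so the callback has to stash the exception itself; error 23 alone cannot distinguish "disk full" from a deliberate abort.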
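
On the timeout hunk: LOW_SPEED_LIMIT goes from 1 to 1000 bytes/s, so a transfer that crawls below 1000 B/s for LOW_SPEED_TIME consecutive seconds now aborts with curl error 28, which the grabber maps to URLGrabError errno 12 ("Timeout"), while a slow-but-alive download is no longer indistinguishable from a dead one; the connect phase keeps its own hard CONNECTTIMEOUT. Roughly, the wiring is (a sketch, using the patch's 300-second default):

import pycurl

def set_timeouts(curl_obj, timeout=300):
    timeout = int(timeout or 0)
    # hard limit on the connect phase only
    curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
    # stall detector instead of a whole-transfer timeout: abort with
    # curl error 28 only if throughput stays below 1000 bytes/s
    # for `timeout` seconds in a row
    curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000)
    curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)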
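
The estimate() method of the _TH (throughput-history) helper now returns a (speed, fail) pair instead of a bare speed, so callers can tell a fresh host from a failing one. The stored per-host speed is halved for each consecutive failure, and as the sample ages it is blended back toward default_speed with weight 2**((ts - now)/half_life). A runnable sketch of just that arithmetic; the default_speed and half_life numbers below are illustrative assumptions, the real values are URLGrabberOptions defaults:

import time

hosts = {}                    # host -> (speed in B/s, consecutive failures, timestamp)
default_speed = 1e6           # illustrative stand-in for opts.default_speed
half_life = 30 * 24 * 3600    # illustrative stand-in for opts.half_life, in seconds

def estimate(host):
    try:
        speed, fail, ts = hosts[host]
    except KeyError:
        return default_speed, 0                    # never seen this host
    speed *= 2.0 ** -fail                          # each failure halves the estimate
    k = 2.0 ** ((ts - time.time()) / half_life)    # sample weight decays with age
    return k * speed + (1 - k) * default_speed, fail

hosts['mirror.example.com'] = (250e3, 1, time.time() - 86400)
print estimate('mirror.example.com')   # penalized, and drifting toward the default
print estimate('other.example.com')    # (1000000.0, 0)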
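
mirror.py puts that estimator to work in MirrorGroup.__init__, which now pre-sorts the mirror list by (private, speed) in descending order, counting the 'private' kwarg only for hosts with no recorded failures; because list.sort() is stable, mirrors the estimator knows nothing about keep their original relative order, exactly as the comment in the hunk says. The same scoring shape is used by the parallel downloader's _best_mirror selection. In isolation (estimate() as in the previous sketch; the hosts are hypothetical):

mirrors = [
    {'mirror': 'http://slow.example.org/repo',    'kwargs': {}},
    {'mirror': 'http://private.example.org/repo', 'kwargs': {'private': True}},
    {'mirror': 'http://fast.example.org/repo',    'kwargs': {}},
]

def mirror_key(m):
    speed, fail = estimate(m['mirror'])
    # the 'private' flag only wins while the host has no failures on record
    private = not fail and m.get('kwargs', {}).get('private', False)
    return private, speed

# stable sort: unknown hosts retain their original relative order
mirrors.sort(key=mirror_key, reverse=True)
print [m['mirror'] for m in mirrors]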