diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index 8a6f37e..6ee5dd6 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,7 +3,7 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.9.1 -Release: 22%{?dist} +Release: 23%{?dist} Source0: urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch @@ -44,6 +44,10 @@ rm -rf $RPM_BUILD_ROOT %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down %changelog +* Thu Dec 6 2012 Zdeněk Pavlas - 3.9.1-23 +- Update to latest HEAD. +- Improve URLGRABBER_DEBUG, add max_connections. BZ 853432 + * Thu Nov 1 2012 Zdeněk Pavlas - 3.9.1-22 - Update to latest HEAD - Get rid of "HTTP 200 OK" errors. BZ 871835. diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch index 55c3ba4..bc028de 100644 --- a/urlgrabber-HEAD.patch +++ b/urlgrabber-HEAD.patch @@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644 return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..74a692c 100644 +index e090e90..78c2e59 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs) @@ -458,7 +458,26 @@ index e090e90..74a692c 100644 ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. -@@ -527,6 +608,29 @@ def _(st): +@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None): + else: handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + DBOBJ = logging.getLogger('urlgrabber') ++ DBOBJ.propagate = False + DBOBJ.addHandler(handler) + DBOBJ.setLevel(level) + except (KeyError, ImportError, ValueError): +@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None): + + def _log_package_state(): + if not DEBUG: return +- DEBUG.info('urlgrabber version = %s' % __version__) +- DEBUG.info('trans function "_" = %s' % _) ++ DEBUG.debug('urlgrabber version = %s' % __version__) ++ DEBUG.debug('trans function "_" = %s' % _) + + _init_default_logger() + _log_package_state() +@@ -527,6 +609,29 @@ def _(st): # END MODULE INITIALIZATION ######################################################################## @@ -488,7 +507,7 @@ index e090e90..74a692c 100644 class URLGrabError(IOError): -@@ -662,6 +766,7 @@ class URLParser: +@@ -662,6 +767,7 @@ class URLParser: opts.quote = 0 --> do not quote it opts.quote = None --> guess """ @@ -496,7 +515,7 @@ index e090e90..74a692c 100644 quote = opts.quote if opts.prefix: -@@ -768,6 +873,41 @@ class URLGrabberOptions: +@@ -768,6 +874,41 @@ class URLGrabberOptions: else: # throttle is a float return self.bandwidth * self.throttle @@ -538,7 +557,7 @@ index e090e90..74a692c 100644 def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the -@@ -791,30 +931,37 @@ class URLGrabberOptions: +@@ -791,30 +932,37 @@ class URLGrabberOptions: provided here. """ self.progress_obj = None @@ -577,7 +596,7 @@ index e090e90..74a692c 100644 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_context = None # no-op in pycurl self.ssl_verify_peer = True # check peer's cert for authenticityb -@@ -827,6 +974,12 @@ class URLGrabberOptions: +@@ -827,6 +975,12 @@ class URLGrabberOptions: self.size = None # if we know how big the thing we're getting is going # to be. this is ultimately a MAXIMUM size for the file self.max_header_size = 2097152 #2mb seems reasonable for maximum header size @@ -590,7 +609,7 @@ index e090e90..74a692c 100644 def __repr__(self): return self.format() -@@ -846,7 +999,18 @@ class URLGrabberOptions: +@@ -846,7 +1000,18 @@ class URLGrabberOptions: s = s + indent + '}' return s @@ -610,7 +629,7 @@ index e090e90..74a692c 100644 """Provides easy opening of URLs with a variety of options. All options are specified as kwargs. Options may be specified when -@@ -872,7 +1036,6 @@ class URLGrabber: +@@ -872,7 +1037,6 @@ class URLGrabber: # beware of infinite loops :) tries = tries + 1 exception = None @@ -618,7 +637,7 @@ index e090e90..74a692c 100644 callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) -@@ -883,54 +1046,62 @@ class URLGrabber: +@@ -883,54 +1047,62 @@ class URLGrabber: except URLGrabError, e: exception = e callback = opts.failure_callback @@ -688,7 +707,7 @@ index e090e90..74a692c 100644 if scheme == 'file' and not opts.copy_local: # just return the name of the local file - don't make a # copy currently -@@ -950,41 +1121,51 @@ class URLGrabber: +@@ -950,41 +1122,51 @@ class URLGrabber: elif not opts.range: if not opts.checkfunc is None: @@ -755,7 +774,7 @@ index e090e90..74a692c 100644 if limit is not None: limit = limit + 1 -@@ -1000,12 +1181,8 @@ class URLGrabber: +@@ -1000,12 +1182,8 @@ class URLGrabber: else: s = fo.read(limit) if not opts.checkfunc is None: @@ -770,7 +789,7 @@ index e090e90..74a692c 100644 finally: fo.close() return s -@@ -1020,6 +1197,7 @@ class URLGrabber: +@@ -1020,6 +1198,7 @@ class URLGrabber: return s def _make_callback(self, callback_obj): @@ -778,7 +797,7 @@ index e090e90..74a692c 100644 if callable(callback_obj): return callback_obj, (), {} else: -@@ -1030,7 +1208,7 @@ class URLGrabber: +@@ -1030,7 +1209,7 @@ class URLGrabber: default_grabber = URLGrabber() @@ -787,7 +806,7 @@ index e090e90..74a692c 100644 def __init__(self, url, filename, opts): self.fo = None self._hdr_dump = '' -@@ -1052,10 +1230,13 @@ class PyCurlFileObject(): +@@ -1052,10 +1231,13 @@ class PyCurlFileObject(): self._reget_length = 0 self._prog_running = False self._error = (None, None) @@ -803,7 +822,7 @@ index e090e90..74a692c 100644 def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for -@@ -1067,6 +1248,12 @@ class PyCurlFileObject(): +@@ -1067,6 +1249,12 @@ class PyCurlFileObject(): def _retrieve(self, buf): try: @@ -816,7 +835,7 @@ index e090e90..74a692c 100644 if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length -@@ -1079,15 +1266,24 @@ class PyCurlFileObject(): +@@ -1079,15 +1267,24 @@ class PyCurlFileObject(): self.opts.progress_obj.update(self._amount_read) self._amount_read += len(buf) @@ -843,7 +862,7 @@ index e090e90..74a692c 100644 try: self._hdr_dump += buf # we have to get the size before we do the progress obj start -@@ -1104,7 +1300,17 @@ class PyCurlFileObject(): +@@ -1104,7 +1301,17 @@ class PyCurlFileObject(): s = parse150(buf) if s: self.size = int(s) @@ -857,12 +876,12 @@ index e090e90..74a692c 100644 + + if len(self._hdr_dump) != 0 and buf == '\r\n': + self._hdr_ended = True -+ if DEBUG: DEBUG.info('header ended:') ++ if DEBUG: DEBUG.debug('header ended:') + return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1319,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1320,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -873,7 +892,7 @@ index e090e90..74a692c 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1127,6 +1335,9 @@ class PyCurlFileObject(): +@@ -1127,6 +1336,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts @@ -883,13 +902,14 @@ index e090e90..74a692c 100644 # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) -@@ -1136,11 +1347,21 @@ class PyCurlFileObject(): +@@ -1136,11 +1348,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) + self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) - if DEBUG: +- if DEBUG: ++ if DEBUG and DEBUG.level <= 10: self.curl_obj.setopt(pycurl.VERBOSE, True) if opts.user_agent: self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) @@ -905,7 +925,7 @@ index e090e90..74a692c 100644 # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) -@@ -1148,9 +1369,11 @@ class PyCurlFileObject(): +@@ -1148,9 +1370,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -920,7 +940,7 @@ index e090e90..74a692c 100644 # ssl options if self.scheme == 'https': -@@ -1158,13 +1381,16 @@ class PyCurlFileObject(): +@@ -1158,13 +1382,16 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) @@ -938,7 +958,7 @@ index e090e90..74a692c 100644 if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: -@@ -1187,28 +1413,26 @@ class PyCurlFileObject(): +@@ -1187,28 +1414,26 @@ class PyCurlFileObject(): if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) @@ -983,7 +1003,7 @@ index e090e90..74a692c 100644 # our url self.curl_obj.setopt(pycurl.URL, self.url) -@@ -1228,39 +1452,36 @@ class PyCurlFileObject(): +@@ -1228,39 +1453,36 @@ class PyCurlFileObject(): code = self.http_code errcode = e.args[0] @@ -1032,7 +1052,7 @@ index e090e90..74a692c 100644 # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside -@@ -1272,33 +1493,94 @@ class PyCurlFileObject(): +@@ -1272,33 +1494,94 @@ class PyCurlFileObject(): elif errcode == 58: msg = _("problem with the local client certificate") err = URLGrabError(14, msg) @@ -1135,7 +1155,7 @@ index e090e90..74a692c 100644 def _do_open(self): self.curl_obj = _curl_cache -@@ -1333,7 +1615,11 @@ class PyCurlFileObject(): +@@ -1333,7 +1616,11 @@ class PyCurlFileObject(): if self.opts.range: rt = self.opts.range @@ -1148,7 +1168,7 @@ index e090e90..74a692c 100644 if rt: header = range_tuple_to_header(rt) -@@ -1434,21 +1720,46 @@ class PyCurlFileObject(): +@@ -1434,21 +1721,46 @@ class PyCurlFileObject(): #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') @@ -1202,7 +1222,7 @@ index e090e90..74a692c 100644 else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) -@@ -1526,17 +1837,20 @@ class PyCurlFileObject(): +@@ -1526,17 +1838,20 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) @@ -1228,7 +1248,7 @@ index e090e90..74a692c 100644 msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) -@@ -1544,13 +1858,6 @@ class PyCurlFileObject(): +@@ -1544,13 +1859,6 @@ class PyCurlFileObject(): return True return False @@ -1242,7 +1262,7 @@ index e090e90..74a692c 100644 def read(self, amt=None): self._fill_buffer(amt) if amt is None: -@@ -1582,9 +1889,21 @@ class PyCurlFileObject(): +@@ -1582,9 +1890,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close() @@ -1265,7 +1285,7 @@ index e090e90..74a692c 100644 ##################################################################### # DEPRECATED FUNCTIONS -@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, +@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, ##################################################################### @@ -1498,17 +1518,23 @@ index e090e90..74a692c 100644 + host_con = {} # current host connection counts + + def start(opts, tries): ++ opts.tries = tries ++ try: ++ dl.start(opts) ++ except OSError, e: ++ # can't spawn downloader, give up immediately ++ opts.exception = URLGrabError(5, exception2msg(e)) ++ _run_callback(opts.failfunc, opts) ++ return ++ + key, limit = opts.async + host_con[key] = host_con.get(key, 0) + 1 -+ opts.tries = tries + if opts.progress_obj: + if opts.multi_progress_obj: + opts._progress = opts.multi_progress_obj.newMeter() + opts._progress.start(text=opts.text) + else: + opts._progress = time.time() # no updates -+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url) -+ dl.start(opts) + + def perform(): + for opts, size, ug_err in dl.perform(): @@ -1588,6 +1614,8 @@ index e090e90..74a692c 100644 + # check global limit + while len(dl.running) >= default_grabber.opts.max_connections: + perform() ++ if DEBUG: ++ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections) + + if opts.mirror_group: + mg, errors, failed, removed = opts.mirror_group @@ -1636,6 +1664,9 @@ index e090e90..74a692c 100644 + key, limit = opts.async + while host_con.get(key, 0) >= limit: + perform() ++ if DEBUG: ++ DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit) ++ + start(opts, 1) + except IOError, e: + if e.errno != 4: raise