diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index 8897bd2..a58dacb 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,7 +3,7 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.9.1 -Release: 28%{?dist} +Release: 29%{?dist} Source0: urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch @@ -44,6 +44,12 @@ rm -rf $RPM_BUILD_ROOT %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down %changelog +* Tue Jun 18 2013 Zdenek Pavlas - 3.9.1-29 +- Update to latest HEAD. +- Fix parsing of FTP 213 responses +- Switch to max_connections=1 after timing out. BZ 853432 +- max_connections=0 should imply the default limit. + * Fri May 17 2013 Zdenek Pavlas - 3.9.1-28 - Update to latest HEAD. - Add the "minrate" option. BZ 964298 diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch index d53e4c3..8ae9c9a 100644 --- a/urlgrabber-HEAD.patch +++ b/urlgrabber-HEAD.patch @@ -314,7 +314,7 @@ index 3e5f3b7..8eeaeda 100644 return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..05ea9c3 100644 +index e090e90..6b409e3 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs) @@ -920,7 +920,7 @@ index e090e90..05ea9c3 100644 if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length -@@ -1079,23 +1274,40 @@ class PyCurlFileObject(): +@@ -1079,32 +1274,62 @@ class PyCurlFileObject(): self.opts.progress_obj.update(self._amount_read) self._amount_read += len(buf) @@ -967,7 +967,10 @@ index e090e90..05ea9c3 100644 elif self.scheme in ['ftp']: s = None if buf.startswith('213 '): -@@ -1104,7 +1316,18 @@ class PyCurlFileObject(): + s = buf[3:].strip() ++ if len(s) >= 14: ++ s = None # ignore MDTM responses + elif buf.startswith('150 '): s = parse150(buf) if s: self.size = int(s) @@ -987,7 +990,7 @@ index e090e90..05ea9c3 100644 return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1336,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1338,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -998,7 +1001,7 @@ index e090e90..05ea9c3 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1127,6 +1352,9 @@ class PyCurlFileObject(): +@@ -1127,6 +1354,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts @@ -1008,7 +1011,7 @@ index e090e90..05ea9c3 100644 # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) -@@ -1136,11 +1364,21 @@ class PyCurlFileObject(): +@@ -1136,11 +1366,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) @@ -1031,7 +1034,7 @@ index e090e90..05ea9c3 100644 # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) -@@ -1148,9 +1386,11 @@ class PyCurlFileObject(): +@@ -1148,9 +1388,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -1046,7 +1049,7 @@ index e090e90..05ea9c3 100644 # ssl options if self.scheme == 'https': -@@ -1158,13 +1398,16 @@ class PyCurlFileObject(): +@@ -1158,13 +1400,16 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) @@ -1064,7 +1067,7 @@ index e090e90..05ea9c3 100644 if opts.ssl_cert_type: 
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: -@@ -1187,28 +1430,26 @@ class PyCurlFileObject(): +@@ -1187,28 +1432,26 @@ class PyCurlFileObject(): if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) @@ -1109,7 +1112,7 @@ index e090e90..05ea9c3 100644 # our url self.curl_obj.setopt(pycurl.URL, self.url) -@@ -1228,39 +1469,26 @@ class PyCurlFileObject(): +@@ -1228,39 +1471,26 @@ class PyCurlFileObject(): code = self.http_code errcode = e.args[0] @@ -1155,7 +1158,7 @@ index e090e90..05ea9c3 100644 # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside -@@ -1269,36 +1497,70 @@ class PyCurlFileObject(): +@@ -1269,36 +1499,70 @@ class PyCurlFileObject(): # figure out what aborted the pycurl process FIXME raise KeyboardInterrupt @@ -1251,7 +1254,7 @@ index e090e90..05ea9c3 100644 def _do_open(self): self.curl_obj = _curl_cache -@@ -1333,7 +1595,11 @@ class PyCurlFileObject(): +@@ -1333,7 +1597,11 @@ class PyCurlFileObject(): if self.opts.range: rt = self.opts.range @@ -1264,7 +1267,7 @@ index e090e90..05ea9c3 100644 if rt: header = range_tuple_to_header(rt) -@@ -1434,21 +1700,46 @@ class PyCurlFileObject(): +@@ -1434,21 +1702,46 @@ class PyCurlFileObject(): #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') @@ -1318,7 +1321,7 @@ index e090e90..05ea9c3 100644 else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) -@@ -1526,17 +1817,20 @@ class PyCurlFileObject(): +@@ -1526,17 +1819,20 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) @@ -1344,7 +1347,7 @@ index e090e90..05ea9c3 100644 msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) -@@ -1544,13 +1838,6 @@ class PyCurlFileObject(): +@@ -1544,13 +1840,6 @@ class PyCurlFileObject(): return True return False @@ -1358,7 +1361,7 @@ index e090e90..05ea9c3 100644 def read(self, amt=None): self._fill_buffer(amt) if amt is None: -@@ -1582,9 +1869,21 @@ class PyCurlFileObject(): +@@ -1582,9 +1871,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close() @@ -1381,7 +1384,7 @@ index e090e90..05ea9c3 100644 ##################################################################### # DEPRECATED FUNCTIONS -@@ -1621,6 +1920,482 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, +@@ -1621,6 +1922,489 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, ##################################################################### @@ -1614,6 +1617,7 @@ index e090e90..05ea9c3 100644 + + dl = _ExternalDownloaderPool() + host_con = {} # current host connection counts ++ single = set() # hosts in single connection mode + + def start(opts, tries): + opts.tries = tries @@ -1660,6 +1664,10 @@ index e090e90..05ea9c3 100644 + + if ug_err is None: + continue ++ if ug_err.errno == pycurl.E_OPERATION_TIMEOUTED: ++ # One possible cause is connection-limited server. ++ # Turn on the max_connections=1 override. 
BZ 853432 ++ single.add(key) + + retry = opts.retry or 0 + if opts.failure_callback: @@ -1749,7 +1757,7 @@ index e090e90..05ea9c3 100644 + + # update the current mirror and limit + key = best['mirror'] -+ limit = best.get('kwargs', {}).get('max_connections', 2) ++ limit = best.get('kwargs', {}).get('max_connections') or 2 + opts.async = key, limit + + # update URL and proxy @@ -1760,6 +1768,8 @@ index e090e90..05ea9c3 100644 + + # check host limit, then start + key, limit = opts.async ++ if key in single: ++ limit = 1 + while host_con.get(key, 0) >= limit: + perform() + if DEBUG: @@ -1865,20 +1875,21 @@ index e090e90..05ea9c3 100644 def _main_test(): try: url, filename = sys.argv[1:3] diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py -index dad410b..7975f1b 100644 +index dad410b..5d3aa34 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py -@@ -76,6 +76,9 @@ CUSTOMIZATION +@@ -76,6 +76,10 @@ CUSTOMIZATION 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. + kwarg 'max_connections' limits the number of concurrent -+ connections to this mirror. ++ connections to this mirror. When omitted or set to zero, ++ the default limit (2) will be used. + 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. -@@ -87,10 +90,14 @@ CUSTOMIZATION +@@ -87,10 +91,14 @@ CUSTOMIZATION """ @@ -1894,7 +1905,7 @@ index dad410b..7975f1b 100644 def _(st): return st -@@ -126,7 +133,9 @@ class MirrorGroup: +@@ -126,7 +134,9 @@ class MirrorGroup: files) * if the local list is ever exhausted, a URLGrabError will be @@ -1905,7 +1916,7 @@ index dad410b..7975f1b 100644 OPTIONS -@@ -153,7 +162,8 @@ class MirrorGroup: +@@ -153,7 +163,8 @@ class MirrorGroup: The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current @@ -1915,7 +1926,7 @@ index dad410b..7975f1b 100644 This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) -@@ -184,6 +194,7 @@ class MirrorGroup: +@@ -184,6 +195,7 @@ class MirrorGroup: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > @@ -1923,7 +1934,7 @@ index dad410b..7975f1b 100644 obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror -@@ -251,6 +262,17 @@ class MirrorGroup: +@@ -251,6 +263,17 @@ class MirrorGroup: self.default_action = None self._process_kwargs(kwargs) @@ -1941,7 +1952,7 @@ index dad410b..7975f1b 100644 # if these values are found in **kwargs passed to one of the urlXXX # methods, they will be stripped before getting passed on to the # grabber -@@ -263,7 +285,8 @@ class MirrorGroup: +@@ -263,7 +286,8 @@ class MirrorGroup: def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: @@ -1951,7 +1962,7 @@ index dad410b..7975f1b 100644 parsed_mirrors.append(m) return parsed_mirrors -@@ -280,7 +303,9 @@ class MirrorGroup: +@@ -280,7 +304,9 @@ class MirrorGroup: # return a random mirror so that multiple mirrors get used # even without failures. 
if not gr.mirrors: @@ -1962,7 +1973,7 @@ index dad410b..7975f1b 100644 return gr.mirrors[gr._next] def _failure(self, gr, cb_obj): -@@ -307,7 +332,9 @@ class MirrorGroup: +@@ -307,7 +333,9 @@ class MirrorGroup: a.update(action) action = a self.increment_mirror(gr, action) @@ -1973,7 +1984,7 @@ index dad410b..7975f1b 100644 def increment_mirror(self, gr, action={}): """Tell the mirror object increment the mirror index -@@ -377,35 +404,50 @@ class MirrorGroup: +@@ -377,35 +405,50 @@ class MirrorGroup: gr.url = url gr.kw = dict(kw) self._load_gr(gr)
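
Editor's note on the changes above, with hedged sketches; none of the following is part of the patch itself.

For reference, a minimal sketch of the FTP "213" parsing rule that the grabber.py hunk above introduces. FTP answers both SIZE and MDTM requests with a "213" status line: SIZE returns a byte count, while MDTM returns a YYYYMMDDHHMMSS timestamp of at least 14 characters, which the old code could misread as a file size. The helper name below is illustrative; the patch inlines this logic in PyCurlFileObject._hdr_retrieve().

    def parse_ftp_213(line):
        # "213 4096"           -> SIZE reply: a byte count
        # "213 20130618094500" -> MDTM reply: a timestamp, not a size
        if not line.startswith('213 '):
            return None
        s = line[3:].strip()
        if len(s) >= 14 or not s.isdigit():
            # 14 or more characters means an MDTM timestamp; the digit
            # check stands in for the patch's "if s:" guard.
            return None
        return int(s)

    assert parse_ftp_213('213 4096\r\n') == 4096
    assert parse_ftp_213('213 20130618094500\r\n') is None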
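The changelog entry "Switch to max_connections=1 after timing out" (BZ 853432) is implemented in the parallel downloader above: a curl "operation timed out" error is taken as a hint that the server may be silently dropping extra concurrent connections, so that host is switched to single-connection mode for the rest of the run. A minimal sketch of that bookkeeping, under illustrative names (the patch keeps this state in a local set called 'single' inside parallel_wait(), next to the per-host connection counts in host_con):

    import pycurl

    single = set()  # hosts forced into single-connection mode

    def effective_limit(host, max_connections):
        # max_connections comes from the mirror's kwargs; zero or None
        # falls back to the default limit of 2, per this update.
        limit = max_connections or 2
        return 1 if host in single else limit

    def on_download_error(host, errno):
        # One possible cause of a timeout is a connection-limited
        # server, so stop opening parallel connections to it.
        if errno == pycurl.E_OPERATION_TIMEOUTED:
            single.add(host)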
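Finally, a hedged usage sketch of the per-mirror 'max_connections' kwarg documented in the mirror.py docstring above. The mirror URLs and file names are hypothetical; URLGrabber and MirrorGroup are urlgrabber's existing public API, and the dict form of a mirror entry follows the docstring's description:

    from urlgrabber.grabber import URLGrabber
    from urlgrabber.mirror import MirrorGroup

    mirrors = [
        # plain string: the default limit of 2 connections applies
        'http://mirror-a.example.com/repo/',
        # dict form: allow up to 4 concurrent connections to this mirror
        {'mirror': 'http://mirror-b.example.com/repo/',
         'kwargs': {'max_connections': 4}},
        # max_connections=0 now also means the default limit
        {'mirror': 'http://mirror-c.example.com/repo/',
         'kwargs': {'max_connections': 0}},
    ]

    mg = MirrorGroup(URLGrabber(retry=3), mirrors)
    mg.urlgrab('packages/foo.rpm', filename='foo.rpm')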