diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index a7ac762..641fde8 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.0
-Release: 6%{?dist}
+Release: 7%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch0: urlgrabber-HEAD.patch
 
@@ -44,6 +44,9 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
+* Mon Aug 10 2009 Seth Vidal - 3.9.0-7
+- reget fixes, tmpfiles no longer made for urlopen() calls.
+
 * Wed Aug 5 2009 Seth Vidal - 3.9.0-5
 - apply complete patch to head fixes: timeouts, regets, improves exception raising
 
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index 8c84095..e5d8c17 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -1,8 +1,8 @@
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index cf51dff..3758799 100644
+index cf51dff..cea47e3 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
-@@ -402,6 +402,7 @@ import urllib
+@@ -402,11 +402,11 @@ import urllib
  import urllib2
  import mimetools
  import thread
@@ -10,7 +10,26 @@ index cf51dff..3758799 100644
  from stat import *  # S_* and ST_*
  import pycurl
  from ftplib import parse150
-@@ -1219,7 +1220,7 @@ class URLGrabberFileObject:
+ from StringIO import StringIO
+-from tempfile import mkstemp
+ 
+ ########################################################################
+ #                     MODULE INITIALIZATION
+@@ -467,6 +467,13 @@ except AttributeError:
+     TimeoutError = None
+     have_socket_timeout = False
+ 
++try:
++    import signal
++    from signal import SIGPIPE, SIG_IGN
++    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
++except ImportError:
++    pass
++
+ ########################################################################
+ # functions for debugging output. These functions are here because they
+ # are also part of the module initialization.
+@@ -1219,7 +1226,7 @@ class URLGrabberFileObject:
          self.append = 0
          reget_length = 0
          rt = None
@@ -19,20 +38,104 @@ index cf51dff..3758799 100644
              # we have reget turned on and we're dumping to a file
              try:
                  s = os.stat(self.filename)
-@@ -1450,6 +1451,7 @@ class PyCurlFileObject():
+@@ -1450,9 +1457,11 @@ class PyCurlFileObject():
          self.scheme = urlparse.urlsplit(self.url)[0]
          self.filename = filename
          self.append = False
 +        self.reget_time = None
          self.opts = opts
++        if self.opts.reget == 'check_timestamp':
++            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
          self._complete = False
-         self.reget_time = None
+         self._rbuf = ''
+         self._rbufsize = 1024*8
+         self._ttime = time.time()
+@@ -1474,39 +1483,45 @@ class PyCurlFileObject():
+         raise AttributeError, name
+ 
+     def _retrieve(self, buf):
+-        if not self._prog_running:
+-            if self.opts.progress_obj:
+-                size = self.size + self._reget_length
+-                self.opts.progress_obj.start(self._prog_reportname,
+-                                             urllib.unquote(self.url),
+-                                             self._prog_basename,
+-                                             size=size,
+-                                             text=self.opts.text)
+-                self._prog_running = True
+-                self.opts.progress_obj.update(self._amount_read)
+-
+-        self._amount_read += len(buf)
+-        self.fo.write(buf)
+-        return len(buf)
+-
++        try:
++            if not self._prog_running:
++                if self.opts.progress_obj:
++                    size = self.size + self._reget_length
++                    self.opts.progress_obj.start(self._prog_reportname,
++                                                 urllib.unquote(self.url),
++                                                 self._prog_basename,
++                                                 size=size,
++                                                 text=self.opts.text)
++                    self._prog_running = True
++                    self.opts.progress_obj.update(self._amount_read)
++
++            self._amount_read += len(buf)
++            self.fo.write(buf)
++            return len(buf)
++        except KeyboardInterrupt:
++            return pycurl.READFUNC_ABORT
++
+     def _hdr_retrieve(self, buf):
+-        self._hdr_dump += buf
+-        # we have to get the size before we do the progress obj start
+-        # but we can't do that w/o making it do 2 connects, which sucks
+-        # so we cheat and stuff it in here in the hdr_retrieve
+-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+-            length = buf.split(':')[1]
+-            self.size = int(length)
+-        elif self.scheme in ['ftp']:
+-            s = None
+-            if buf.startswith('213 '):
+-                s = buf[3:].strip()
+-            elif buf.startswith('150 '):
+-                s = parse150(buf)
+-            if s:
+-                self.size = s
+-
+-        return len(buf)
++        try:
++            self._hdr_dump += buf
++            # we have to get the size before we do the progress obj start
++            # but we can't do that w/o making it do 2 connects, which sucks
++            # so we cheat and stuff it in here in the hdr_retrieve
++            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
++                length = buf.split(':')[1]
++                self.size = int(length)
++            elif self.scheme in ['ftp']:
++                s = None
++                if buf.startswith('213 '):
++                    s = buf[3:].strip()
++                elif buf.startswith('150 '):
++                    s = parse150(buf)
++                if s:
++                    self.size = s
++
++            return len(buf)
++        except KeyboardInterrupt:
++            return pycurl.READFUNC_ABORT
 
      def _return_hdr_obj(self):
          if self._parsed_hdr:
-@@ -1528,11 +1530,12 @@ class PyCurlFileObject():
+@@ -1528,11 +1543,13 @@ class PyCurlFileObject():
          # defaults we're always going to set
          self.curl_obj.setopt(pycurl.NOPROGRESS, False)
++        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
          self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
          self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
@@ -42,7 +145,7 @@ index cf51dff..3758799 100644
          if DEBUG:
              self.curl_obj.setopt(pycurl.VERBOSE, True)
-@@ -1540,15 +1543,15 @@ class PyCurlFileObject():
+@@ -1540,15 +1557,15 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
 
          # maybe to be options later
@@ -61,15 +164,27 @@ index cf51dff..3758799 100644
          # ssl options
          if self.scheme == 'https':
              if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
-@@ -1607,18 +1610,21 @@ class PyCurlFileObject():
+@@ -1607,18 +1624,33 @@ class PyCurlFileObject():
              # to other URLGrabErrors from
              # http://curl.haxx.se/libcurl/c/libcurl-errors.html
              # this covers e.args[0] == 22 pretty well - which will be common
++            code = self.http_code
 +            if e.args[0] == 28:
 +                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
 +                err.url = self.url
 +                raise err
-+            code = self.http_code
++
++            elif e.args[0] == 23 and code >= 200 and code < 299:
++                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
++                err.url = self.url
++                # this is probably wrong but ultimately this is what happens
++                # we have a legit http code and a pycurl 'writer failed' code
++                # which almost always means something aborted it from outside
++                # since we cannot know what it is - I'm banking on it being
++                # a ctrl-c. XXXX - if there's a way of going back two raises to
++                # figure out what aborted the pycurl process FIXME
++                raise KeyboardInterrupt
++
              if str(e.args[1]) == '':
                  # fake it until you make it
                  msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
              else:
                  msg = 'PYCURL ERROR %s - "%s"' % (e.args[0], str(e.args[1]))
              err = URLGrabError(14, msg)
+             err.code = code
              err.exception = e
              raise err
-
+-
++
@@ -79,14 +194,15 @@ index cf51dff..3758799 100644
      def _do_open(self):
 -        self.append = False
 -        self.reget_time = None
          self.curl_obj = _curl_cache
          self.curl_obj.reset() # reset all old settings away, just in case
          # setup any ranges
-@@ -1630,11 +1636,9 @@ class PyCurlFileObject():
+@@ -1630,11 +1662,9 @@ class PyCurlFileObject():
              pass
 
      def _build_range(self):
 -        self.reget_time = None
 -        self.append = False
          reget_length = 0
          rt = None
          # we have reget turned on and we're dumping to a file
          try:
              s = os.stat(self.filename)
@@ -99,7 +215,7 @@ index cf51dff..3758799 100644
-@@ -1729,7 +1733,7 @@ class PyCurlFileObject():
+@@ -1729,7 +1759,7 @@ class PyCurlFileObject():
          if self.filename:
              self._prog_reportname = str(self.filename)
              self._prog_basename = os.path.basename(self.filename)
 -            
 +        
          if self.append: mode = 'ab'
          else: mode = 'wb'
@@ -108,3 +224,50 @@ index cf51dff..3758799 100644
          else:
              self._prog_reportname = 'MEMORY'
              self._prog_basename = 'MEMORY'
-             fh, self._temp_name = mkstemp()
+ 
-             self.fo = open(self._temp_name, 'wb')
+             self.fo = StringIO()
+             # if this is to be a tempfile instead....
+             # it just makes crap in the tempdir
+             #fh, self._temp_name = mkstemp()
+             #self.fo = open(self._temp_name, 'wb')
 
          self._do_perform()
 
-         # close it up
-         self.fo.flush()
-         self.fo.close()
 
          if self.filename:
+             # close it up
+             self.fo.flush()
+             self.fo.close()
              # set the time
              mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
              if mod_time != -1:
              # re open it
              self.fo = open(self.filename, 'r')
          else:
-             self.fo = open(self._temp_name, 'r')
+             #self.fo = open(self._temp_name, 'r')
+             self.fo.seek(0)
 
          self._complete = True
@@ -1838,6 +1873,7 @@ class PyCurlFileObject():
              downloaded += self._reget_length
              self.opts.progress_obj.update(downloaded)
 
+
      def read(self, amt=None):
          self._fill_buffer(amt)
          if amt is None:
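
For context, the reget behavior this commit fixes boils down to: stat the partial
file, open it in append mode, and ask the server only for the remaining bytes. A
minimal standalone sketch of that technique with plain pycurl follows; resume_fetch(),
url and filename are hypothetical names for illustration, not urlgrabber's API, and
the real logic lives in _build_range() and _do_grab():

import os
import pycurl

def resume_fetch(url, filename):
    # how far did the previous download get? (what _build_range() checks)
    reget_length = 0
    try:
        reget_length = os.stat(filename).st_size
    except OSError:
        pass                      # nothing there yet: plain full download

    fo = open(filename, 'ab')     # append, like mode = 'ab' in _do_grab()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, fo.write)
    c.setopt(pycurl.NOSIGNAL, True)
    if reget_length:
        # open-ended byte range: everything from the current size onward
        c.setopt(pycurl.RANGE, '%s-' % reget_length)
    try:
        c.perform()
    finally:
        c.close()
        fo.close()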
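The urlopen() change and the abort handling interact: with no filename, the patched
code now collects the body in a StringIO (no more mkstemp() litter in the tempdir),
and a KeyboardInterrupt inside the write callback surfaces as pycurl error 23, which
_do_perform() re-raises. A sketch of that flow under the same assumptions as above,
with fetch_to_memory() as an invented name rather than the module's public interface:

from StringIO import StringIO
import pycurl

def fetch_to_memory(url):
    fo = StringIO()

    def write_cb(buf):
        try:
            fo.write(buf)
            return len(buf)       # returning anything else makes libcurl abort
        except KeyboardInterrupt:
            return pycurl.READFUNC_ABORT

    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, write_cb)
    c.setopt(pycurl.NOSIGNAL, True)
    try:
        c.perform()
    except pycurl.error, e:
        code = c.getinfo(pycurl.HTTP_CODE)
        if e.args[0] == 23 and code >= 200 and code < 299:
            # a 2xx status plus 'failed writing body' means the callback
            # aborted; the same inference the patched _do_perform() makes
            raise KeyboardInterrupt
        raise
    finally:
        c.close()

    fo.seek(0)                    # rewind in place of reopening a tempfile
    return fo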