diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index 2c3c026..43e0a39 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,8 +3,9 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 1%{?dist}
+Release: 2%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
+Patch1: urlgrabber-HEAD.patch
 License: LGPLv2+
 Group: Development/Libraries
@@ -22,7 +23,7 @@ authentication, proxies and more.
 
 %prep
 %setup -q -n urlgrabber-%{version}
-
+%patch1 -p1
 
 %build
 python setup.py build
@@ -42,6 +43,9 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
+* Fri Sep 25 2009 Seth Vidal - 3.9.1-2
+- stupid patch
+
 * Fri Sep 25 2009 Seth Vidal - 3.9.1-1
 - 3.9.1
 
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index 4d2905c..bfbcee5 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -1,385 +1,36 @@
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index cf51dff..979b4c1 100644
+index e090e90..c4916d5 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
-@@ -402,11 +402,11 @@ import urllib
- import urllib2
- import mimetools
- import thread
-+import types
- from stat import * # S_* and ST_*
- import pycurl
- from ftplib import parse150
- from StringIO import StringIO
--from tempfile import mkstemp
- 
- ########################################################################
- #   MODULE INITIALIZATION
-@@ -467,6 +467,13 @@ except AttributeError:
-     TimeoutError = None
-     have_socket_timeout = False
- 
-+try:
-+    import signal
-+    from signal import SIGPIPE, SIG_IGN
-+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
-+except ImportError:
-+    pass
-+
- ########################################################################
- # functions for debugging output. These functions are here because they
- # are also part of the module initialization.
-@@ -859,8 +866,15 @@ class URLGrabberOptions:
-         self.data = None
-         self.urlparser = URLParser()
-         self.quote = None
--        self.ssl_ca_cert = None
--        self.ssl_context = None
-+        self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
-+        self.ssl_context = None # no-op in pycurl
-+        self.ssl_verify_peer = True # check peer's cert for authenticity
-+        self.ssl_verify_host = True # make sure who they are and who the cert is for matches
-+        self.ssl_key = None # client key
-+        self.ssl_key_type = 'PEM' # (or DER)
-+        self.ssl_cert = None # client cert
-+        self.ssl_cert_type = 'PEM' # (or DER)
-+        self.ssl_key_pass = None # password to access the key
- 
-     def __repr__(self):
-         return self.format()
-@@ -1219,7 +1233,7 @@ class URLGrabberFileObject:
-         self.append = 0
-         reget_length = 0
-         rt = None
--        if have_range and self.opts.reget and type(self.filename) == type(''):
-+        if have_range and self.opts.reget and type(self.filename) in types.StringTypes:
-             # we have reget turned on and we're dumping to a file
-             try:
-                 s = os.stat(self.filename)
-@@ -1450,9 +1464,11 @@ class PyCurlFileObject():
-         self.scheme = urlparse.urlsplit(self.url)[0]
-         self.filename = filename
-         self.append = False
-+        self.reget_time = None
-         self.opts = opts
-+        if self.opts.reget == 'check_timestamp':
-+            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
-         self._complete = False
--        self.reget_time = None
-         self._rbuf = ''
-         self._rbufsize = 1024*8
-         self._ttime = time.time()
-@@ -1474,39 +1490,45 @@ class PyCurlFileObject():
-             raise AttributeError, name
- 
-     def _retrieve(self, buf):
--        if not self._prog_running:
--            if self.opts.progress_obj:
--                size = self.size + self._reget_length
--                self.opts.progress_obj.start(self._prog_reportname,
--                                             urllib.unquote(self.url),
--                                             self._prog_basename,
--                                             size=size,
--                                             text=self.opts.text)
--                self._prog_running = True
--                self.opts.progress_obj.update(self._amount_read)
--
--        self._amount_read += len(buf)
--        self.fo.write(buf)
--        return len(buf)
--
-+        try:
-+            if not self._prog_running:
-+                if self.opts.progress_obj:
-+                    size = self.size + self._reget_length
-+                    self.opts.progress_obj.start(self._prog_reportname,
-+                                                 urllib.unquote(self.url),
-+                                                 self._prog_basename,
-+                                                 size=size,
-+                                                 text=self.opts.text)
-+                    self._prog_running = True
-+                    self.opts.progress_obj.update(self._amount_read)
-+
-+            self._amount_read += len(buf)
-+            self.fo.write(buf)
-+            return len(buf)
-+        except KeyboardInterrupt:
-+            return -1
-+
-     def _hdr_retrieve(self, buf):
--        self._hdr_dump += buf
--        # we have to get the size before we do the progress obj start
--        # but we can't do that w/o making it do 2 connects, which sucks
--        # so we cheat and stuff it in here in the hdr_retrieve
--        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
--            length = buf.split(':')[1]
--            self.size = int(length)
--        elif self.scheme in ['ftp']:
--            s = None
--            if buf.startswith('213 '):
--                s = buf[3:].strip()
--            elif buf.startswith('150 '):
--                s = parse150(buf)
--            if s:
--                self.size = s
--
--        return len(buf)
-+        try:
-+            self._hdr_dump += buf
-+            # we have to get the size before we do the progress obj start
-+            # but we can't do that w/o making it do 2 connects, which sucks
-+            # so we cheat and stuff it in here in the hdr_retrieve
-+            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
-+                length = buf.split(':')[1]
-+                self.size = int(length)
-+            elif self.scheme in ['ftp']:
-+                s = None
-+                if buf.startswith('213 '):
-+                    s = buf[3:].strip()
-+                elif buf.startswith('150 '):
-+                    s = parse150(buf)
-+                if s:
-+                    self.size = s
-+
-+            return len(buf)
-+        except KeyboardInterrupt:
-+            return pycurl.READFUNC_ABORT
- 
-     def _return_hdr_obj(self):
-         if self._parsed_hdr:
-@@ -1528,11 +1550,13 @@ class PyCurlFileObject():
- 
- 
-         # defaults we're always going to set
--        self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
-+        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-+        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
-         self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
-         self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
-         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
--        self.curl_obj.setopt(pycurl.FAILONERROR, 1)
-+        self.curl_obj.setopt(pycurl.FAILONERROR, True)
-+        self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
- 
-         if DEBUG:
-             self.curl_obj.setopt(pycurl.VERBOSE, True)
-@@ -1540,19 +1564,32 @@ class PyCurlFileObject():
-         self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+@@ -1052,7 +1052,7 @@ class PyCurlFileObject():
+         self._reget_length = 0
+         self._prog_running = False
+         self._error = (None, None)
+-        self.size = None
++        self.size = 0
+         self._do_open()
 
-         # maybe to be options later
--        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, 1)
-+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-         self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
--        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, 30)
-         # timeouts
-         timeout = 300
-         if opts.timeout:
-             timeout = int(opts.timeout)
--        self.curl_obj.setopt(pycurl.TIMEOUT, timeout)
-+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+
-         # ssl options
-         if self.scheme == 'https':
-             if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
-                 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
-+                self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
-+            self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
-+            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
-+            if opts.ssl_key:
-+                self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
-+            if opts.ssl_key_type:
-+                self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
-+            if opts.ssl_cert:
-+                self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
-+            if opts.ssl_cert_type:
-+                self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
-+            if opts.ssl_key_pass:
-+                self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
- 
-         #headers:
-         if opts.http_headers and self.scheme in ('http', 'https'):
-@@ -1578,19 +1615,21 @@ class PyCurlFileObject():
-             if scheme not in ('ftp'):
-                 continue
-             else:
-+                if proxy == '_none_': proxy = ""
-                 self.curl_obj.setopt(pycurl.PROXY, proxy)
-         elif self.scheme in ('http', 'https'):
-             if scheme not in ('http', 'https'):
-                 continue
-             else:
-+                if proxy == '_none_': proxy = ""
-                 self.curl_obj.setopt(pycurl.PROXY, proxy)
--
--        # username/password/auth settings
-+ 
-+        # FIXME username/password/auth settings
- 
-         #posts - simple - expects the fields as they are
-         if opts.data:
-             self.curl_obj.setopt(pycurl.POST, True)
--            self.curl_obj.setopt(pycurl.POSTFIELDS, opts.data)
-+            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
- 
-         # our url
-         self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1607,18 +1646,62 @@ class PyCurlFileObject():
-         # to other URLGrabErrors from
-         # http://curl.haxx.se/libcurl/c/libcurl-errors.html
-         # this covers e.args[0] == 22 pretty well - which will be common
--        if str(e.args[1]) == '': # fake it until you make it
-+        code = self.http_code
-+        if e.args[0] == 23 and code >= 200 and code < 299:
-+            err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-+            err.url = self.url
-+            # this is probably wrong but ultimately this is what happens
-+            # we have a legit http code and a pycurl 'writer failed' code
-+            # which almost always means something aborted it from outside
-+            # since we cannot know what it is -I'm banking on it being
-+            # a ctrl-c. XXXX - if there's a way of going back two raises to
-+            # figure out what aborted the pycurl process FIXME
-+            raise KeyboardInterrupt
-+
-+        elif e.args[0] == 28:
-+            err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
-+            err.url = self.url
-+            raise err
-+        elif e.args[0] == 35:
-+            msg = _("problem making ssl connection")
-+            err = URLGrabError(14, msg)
-+            err.url = self.url
-+            raise err
-+
-+        if e.args[0] == 42:
-+            err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-+            err.url = self.url
-+            # this is probably wrong but ultimately this is what happens
-+            # we have a legit http code and a pycurl 'writer failed' code
-+            # which almost always means something aborted it from outside
-+            # since we cannot know what it is -I'm banking on it being
-+            # a ctrl-c. XXXX - if there's a way of going back two raises to
-+            # figure out what aborted the pycurl process FIXME
-+            raise KeyboardInterrupt
-+
-+        elif e.args[0] == 58:
-+            msg = _("problem with the local client certificate")
-+            err = URLGrabError(14, msg)
-+            err.url = self.url
-+            raise err
-+
-+        elif e.args[0] == 60:
-+            msg = _("client cert cannot be verified or client cert incorrect")
-+            err = URLGrabError(14, msg)
-+            err.url = self.url
-+            raise err
-+
-+        elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-             msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
-         else:
--            msg = str(e.args[1])
-+            msg = 'PYCURL ERROR %s - "%s"' % (e.args[0], str(e.args[1]))
-+            code = e.args[0]
-         err = URLGrabError(14, msg)
--        err.code = self.http_code
-+        err.code = code
+@@ -1299,6 +1299,12 @@ class PyCurlFileObject():
+             err.code = code
              err.exception = e
              raise err
++        else:
++            if self._error[1]:
++                msg = self._error[1]
++                err = URLGrabError(14, msg)
++                err.url = self.url
++                raise err
+ 
      def _do_open(self):
--        self.append = False
--        self.reget_time = None
          self.curl_obj = _curl_cache
-         self.curl_obj.reset() # reset all old settings away, just in case
-         # setup any ranges
-@@ -1630,11 +1713,9 @@ class PyCurlFileObject():
-             pass
- 
-     def _build_range(self):
--        self.reget_time = None
--        self.append = False
-         reget_length = 0
-         rt = None
--        if self.opts.reget and type(self.filename) == type(''):
-+        if self.opts.reget and type(self.filename) in types.StringTypes:
-             # we have reget turned on and we're dumping to a file
-             try:
-                 s = os.stat(self.filename)
-@@ -1726,10 +1807,10 @@ class PyCurlFileObject():
-         if self._complete:
-             return
- 
--        if self.filename:
-+        if self.filename is not None:
-             self._prog_reportname = str(self.filename)
-             self._prog_basename = os.path.basename(self.filename)
--
-+ 
-         if self.append: mode = 'ab'
-         else: mode = 'wb'
- 
-@@ -1746,19 +1827,23 @@ class PyCurlFileObject():
-         else:
-             self._prog_reportname = 'MEMORY'
-             self._prog_basename = 'MEMORY'
--            fh, self._temp_name = mkstemp()
+@@ -1536,7 +1542,8 @@ class PyCurlFileObject():
+         if self.opts.size: # if we set an opts size use that, no matter what
+             max_size = self.opts.size
+         if not max_size: return False # if we have None for all of the Max then this is dumb
+-        if cur > max_size + max_size*.10:
+
- 
--            self.fo = open(self._temp_name, 'wb')
-+            self.fo = StringIO()
-+            # if this is to be a tempfile instead....
-+            # it just makes crap in the tempdir
-+            #fh, self._temp_name = mkstemp()
-+            #self.fo = open(self._temp_name, 'wb')
- 
- 
-         self._do_perform()
- 
++        if cur > int(float(max_size) * 1.10):
--        # close it up
--        self.fo.flush()
--        self.fo.close()
- 
-         if self.filename:
-+            # close it up
-+            self.fo.flush()
-+            self.fo.close()
-             # set the time
-             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
-             if mod_time != -1:
-@@ -1766,7 +1851,8 @@ class PyCurlFileObject():
-                 # re open it
-                 self.fo = open(self.filename, 'r')
-         else:
--            self.fo = open(self._temp_name, 'r')
-+            #self.fo = open(self._temp_name, 'r')
-+            self.fo.seek(0)
- 
-         self._complete = True
- 
-@@ -1834,10 +1920,20 @@ class PyCurlFileObject():
-             return
- 
-     def _progress_update(self, download_total, downloaded, upload_total, uploaded):
--        if self._prog_running:
--            downloaded += self._reget_length
--            self.opts.progress_obj.update(downloaded)
--
-+        try:
-+            if self._prog_running:
-+                downloaded += self._reget_length
-+                self.opts.progress_obj.update(downloaded)
-+        except KeyboardInterrupt:
-+            return -1
-+
-+    def _to_utf8(self, obj, errors='replace'):
-+        '''convert 'unicode' to an encoded utf-8 byte string '''
-+        # stolen from yum.i18n
-+        if isinstance(obj, unicode):
-+            obj = obj.encode('utf-8', errors)
-+        return obj
-+
-     def read(self, amt=None):
-         self._fill_buffer(amt)
-         if amt is None:
+             msg = _("Downloaded more than max size for %s: %s > %s") \
+                        % (self.url, cur, max_size)