Tree - rpms/python-urlgrabber - src.fedoraproject.org

rpms / python-urlgrabber

Overview Files Commits Branches Forks Releases
Monitoring status:

Bugzilla Assignee:

Fedora:: ngompa
EPEL:: ngompa
Files

Commit: a16417493c8f620f0bd51891be1b2c1124acb290
Blob Blame History Raw
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
                     including quotes in the case of strings.
                     e.g.  --user_agent='"foobar/2.0"'
 
+  --output FILE
   -o FILE           write output to FILE, otherwise the basename of the
                     url will be used
   -O                print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
         return ug_options, ug_defaults
 
     def process_command_line(self):
-        short_options = 'vd:hoOpD'
+        short_options = 'vd:ho:OpD'
         long_options = ['profile', 'repeat=', 'verbose=',
-                        'debug=', 'help', 'progress']
+                        'debug=', 'help', 'progress', 'output=']
         ug_long = [ o + '=' for o in self.ug_options ]
-        optlist, args = getopt.getopt(sys.argv[1:], short_options,
-                                      long_options + ug_long)
+        try:
+            optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                          long_options + ug_long)
+        except getopt.GetoptError, e:
+            print >>sys.stderr, "Error:", e
+            self.help([], ret=1)
+
         self.verbose = 0
         self.debug = None
         self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
             if o == '--verbose': self.verbose = v
             if o == '-v':        self.verbose += 1
             if o == '-o':        self.outputfile = v
+            if o == '--output':  self.outputfile = v
             if o == '-p' or o == '--progress': self.progress = 1
             if o == '-d' or o == '--debug': self.debug = v
             if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
             print "ERROR: cannot use -o when grabbing multiple files"
             sys.exit(1)
 
-    def help(self, args):
+    def help(self, args, ret=0):
         if not args:
             print MAINHELP
         else:
@@ -234,7 +241,7 @@ class client_options:
                     self.help_ug_option(a)
                 else:
                     print 'ERROR: no help on command "%s"' % a
-        sys.exit(0)
+        sys.exit(ret)
 
     def help_doc(self):
         print __doc__
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
 from munittest import *
 
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp  = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 3e5f3b7..8eeaeda 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
     
     def http_error_416(self, req, fp, code, msg, hdrs):
         # HTTP's Range Not Satisfiable error
-        raise RangeError('Requested Range Not Satisfiable')
+        raise RangeError(9, 'Requested Range Not Satisfiable')
 
 class HTTPSRangeHandler(HTTPRangeHandler):
     """ Range Header support for HTTPS. """
@@ -208,7 +208,7 @@ class RangeableFileObject:
                 bufsize = offset - pos
             buf = self.fo.read(bufsize)
             if len(buf) != bufsize:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             pos+= bufsize
 
 class FileRangeHandler(urllib2.FileHandler):
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
             (fb,lb) = brange
             if lb == '': lb = size
             if fb < 0 or fb > size or lb > size:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             size = (lb - fb)
             fo = RangeableFileObject(fo, (fb,lb))
         headers = mimetools.Message(StringIO(
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
                 (fb,lb) = range_tup
                 if lb == '': 
                     if retrlen is None or retrlen == 0:
-                        raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+                        raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
                     lb = retrlen
                     retrlen = lb - fb
                     if retrlen < 0:
                         # beginning of range is larger than file
-                        raise RangeError('Requested Range Not Satisfiable')
+                        raise RangeError(9, 'Requested Range Not Satisfiable')
                 else:
                     retrlen = lb - fb
                     fp = RangeableFileObject(fp, (0,retrlen))
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
     # check if range is over the entire file
     if (fb,lb) == (0,''): return None
     # check that the range is valid
-    if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+    if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
     return (fb,lb)
 
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..38ae1f7 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
   progress_obj = None
 
     a class instance that supports the following methods:
-      po.start(filename, url, basename, length, text)
+      po.start(filename, url, basename, size, now, text)
       # length will be None if unknown
       po.update(read) # read == bytes read so far
       po.end()
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.
 
-  timeout = None
+  timeout = 300
 
-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. The timeout is passed to
+    the underlying pycurl object as its CONNECTTIMEOUT option and is also
+    used as LOW_SPEED_TIME (with LOW_SPEED_LIMIT set to 1). If the value
+    is None or 0, 0 is passed and curl applies its own defaults; see
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
 
   bandwidth = 0
 
@@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs)
     note that proxy authentication information may be provided using
     normal URL constructs:
       proxies={ 'http' : 'http://user:host@foo:3128' }
-    Lastly, if proxies is None, the default environment settings will
-    be used.
+
+  libproxy = False
+
+    Use the libproxy module (if installed) to find proxies.
+    The libproxy code is only used if the proxies dictionary
+    does not provide any proxies.
 
   prefix = None
 
@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs)
     control, you should probably subclass URLParser and pass it in via
     the 'urlparser' option.
 
+  username = None
+    username to use for simple http auth - is automatically quoted for special characters
+
+  password = None
+    password to use for simple http auth - is automatically quoted for special characters
+
   ssl_ca_cert = None
 
     this option can be used if M2Crypto is available and will be
@@ -211,43 +221,48 @@ GENERAL ARGUMENTS (kwargs)
     No-op when using the curl backend (default)
    
 
-  self.ssl_verify_peer = True 
+  ssl_verify_peer = True
 
     Check the server's certificate to make sure it is valid with what our CA validates
   
-  self.ssl_verify_host = True
+  ssl_verify_host = True
 
     Check the server's hostname to make sure it matches the certificate DN
 
-  self.ssl_key = None
+  ssl_key = None
 
     Path to the key the client should use to connect/authenticate with
 
-  self.ssl_key_type = 'PEM' 
+  ssl_key_type = 'PEM'
 
     PEM or DER - format of key
      
-  self.ssl_cert = None
+  ssl_cert = None
 
     Path to the ssl certificate the client should use to to authenticate with
 
-  self.ssl_cert_type = 'PEM' 
+  ssl_cert_type = 'PEM'
 
     PEM or DER - format of certificate
     
-  self.ssl_key_pass = None 
+  ssl_key_pass = None
 
     password to access the ssl_key
     
-  self.size = None
+  size = None
 
     size (in bytes) or Maximum size of the thing being downloaded. 
     This is mostly to keep us from exploding with an endless datastream
   
-  self.max_header_size = 2097152 
+  max_header_size = 2097152
 
     Maximum size (in bytes) of the headers.
     
+  ip_resolve = 'whatever'
+
+    What type of name to IP resolving to use: 'whatever' (both IPV4 and
+    IPV6, the default), 'ipv4' or 'ipv6'. The value is case-insensitive.
+
 
 RETRY RELATED ARGUMENTS
 
@@ -420,6 +435,7 @@ import time
 import string
 import urllib
 import urllib2
+from httplib import responses
 import mimetools
 import thread
 import types
@@ -431,6 +447,14 @@ from httplib import HTTPException
 import socket
 from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
 
+try:
+    import xattr
+    if not hasattr(xattr, 'set'):
+        xattr = None # This is a "newer" API.
+except ImportError:
+    xattr = None
+
+
 ########################################################################
 #                     MODULE INITIALIZATION
 ########################################################################
@@ -439,6 +463,12 @@ try:
 except:
     __version__ = '???'
 
+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+    
 ########################################################################
 # functions for debugging output.  These functions are here because they
 # are also part of the module initialization.
@@ -527,6 +557,22 @@ def _(st):
 #                 END MODULE INITIALIZATION
 ########################################################################
 
+########################################################################
+#                 UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+    '''convert 'unicode' to an encoded utf-8 byte string '''
+    # stolen from yum.i18n
+    if isinstance(obj, unicode):
+        obj = obj.encode('utf-8', errors)
+    return obj
+
+########################################################################
+#                 END UTILITY FUNCTIONS
+########################################################################
 
 
 class URLGrabError(IOError):
@@ -662,6 +708,7 @@ class URLParser:
           opts.quote = 0     --> do not quote it
           opts.quote = None  --> guess
         """
+        url = _to_utf8(url)
         quote = opts.quote
         
         if opts.prefix:
@@ -768,6 +815,41 @@ class URLGrabberOptions:
         else: # throttle is a float
             return self.bandwidth * self.throttle
         
+    def find_proxy(self, url, scheme):
+        """Find the proxy to use for this URL.
+        Use the proxies dictionary first, then libproxy.
+        """
+        self.proxy = None
+        if scheme not in ('ftp', 'http', 'https'):
+            return
+
+        if self.proxies:
+            proxy = self.proxies.get(scheme)
+            if proxy is None:
+                if scheme == 'http':
+                    proxy = self.proxies.get('https')
+                elif scheme == 'https':
+                    proxy = self.proxies.get('http')
+            if proxy == '_none_':
+                proxy = ''
+            self.proxy = proxy
+            return
+
+        if self.libproxy:
+            global _libproxy_cache
+            if _libproxy_cache is None:
+                try:
+                    import libproxy
+                    _libproxy_cache = libproxy.ProxyFactory()
+                except:
+                    _libproxy_cache = False
+            if _libproxy_cache:
+                for proxy in _libproxy_cache.getProxies(url):
+                    if proxy.startswith('http://'):
+                        if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+                        self.proxy = proxy
+                        break
+
     def derive(self, **kwargs):
         """Create a derived URLGrabberOptions instance.
         This method creates a new instance and overrides the
@@ -800,21 +882,25 @@ class URLGrabberOptions:
         self.close_connection = 0
         self.range = None
         self.user_agent = 'urlgrabber/%s' % __version__
+        self.ip_resolve = None
         self.keepalive = 1
         self.proxies = None
+        self.libproxy = False
         self.reget = None
         self.failure_callback = None
         self.interrupt_callback = None
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
+        self.username = None
+        self.password = None
         self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
         self.ssl_context = None # no-op in pycurl
         self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -846,7 +932,7 @@ class URLGrabberOptions:
         s = s + indent + '}'
         return s
 
-class URLGrabber:
+class URLGrabber(object):
     """Provides easy opening of URLs with a variety of options.
     
     All options are specified as kwargs. Options may be specified when
@@ -912,9 +998,11 @@ class URLGrabber:
         returned that supports them. The file object can be treated 
         like any other file object.
         """
+        url = _to_utf8(url)
         opts = self.opts.derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts) 
+        opts.find_proxy(url, parts[0])
         def retryfunc(opts, url):
             return PyCurlFileObject(url, filename=None, opts=opts)
         return self._retry(opts, retryfunc, url)
@@ -925,12 +1013,17 @@ class URLGrabber:
         urlgrab returns the filename of the local file, which may be 
         different from the passed-in filename if copy_local == 0.
         """
+        url = _to_utf8(url)
         opts = self.opts.derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts) 
         (scheme, host, path, parm, query, frag) = parts
+        opts.find_proxy(url, scheme)
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
+            if not filename:
+                # This is better than nothing.
+                filename = 'index.html'
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a 
             # copy currently
@@ -982,9 +1075,11 @@ class URLGrabber:
         "I want the first N bytes" but rather 'read the whole file 
         into memory, but don't use too much'
         """
+        url = _to_utf8(url)
         opts = self.opts.derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts) 
+        opts.find_proxy(url, parts[0])
         if limit is not None:
             limit = limit + 1
             
@@ -1030,7 +1125,7 @@ class URLGrabber:
 default_grabber = URLGrabber()
 
 
-class PyCurlFileObject():
+class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
         self._hdr_dump = ''
@@ -1052,10 +1147,11 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
         self._do_open()
         
-        
+
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
         Any attribute not found in _this_ object will be searched for
@@ -1085,9 +1181,14 @@ class PyCurlFileObject():
             return -1
             
     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump), 
                                max_size=self.opts.max_header_size):
-            return -1            
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1205,17 @@ class PyCurlFileObject():
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
-            
+                    
+            if buf.lower().startswith('location:'):
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+                
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.info('header ended:')
+                
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1224,10 @@ class PyCurlFileObject():
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # step past the newline so the slice starts after the status line
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr =  mimetools.Message(hdrfp)
         return self._parsed_hdr
     
@@ -1127,6 +1240,9 @@ class PyCurlFileObject():
         if not opts:
             opts = self.opts
 
+        # keepalives
+        if not opts.keepalive:
+            self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
 
         # defaults we're always going to set
         self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1252,21 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
         
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
         if opts.user_agent:
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+        if opts.ip_resolve:
+            # Default is: IPRESOLVE_WHATEVER
+            ipr = opts.ip_resolve.lower()
+            if ipr == 'whatever': # Do we need this?
+                self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER)
+            if ipr == 'ipv4':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+            if ipr == 'ipv6':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
         
         # maybe to be options later
         self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1274,11 @@ class PyCurlFileObject():
         
         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
 
         # ssl options
         if self.scheme == 'https':
@@ -1158,13 +1286,16 @@ class PyCurlFileObject():
                 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
             self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
-            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+            if opts.ssl_verify_host: # 1 is meaningless to curl
+                self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
             if opts.ssl_key:
                 self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
             if opts.ssl_key_type:
                 self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
             if opts.ssl_cert:
                 self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+                # if we have a client side cert - turn off reuse b/c nss is odd
+                self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
             if opts.ssl_cert_type:                
                 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
             if opts.ssl_key_pass:
@@ -1187,28 +1318,24 @@ class PyCurlFileObject():
         if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
             self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
             
-        # proxy settings
-        if opts.proxies:
-            for (scheme, proxy) in opts.proxies.items():
-                if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
-                    if scheme not in ('ftp'):
-                        continue
-                    else:
-                        if proxy == '_none_': proxy = ""
-                        self.curl_obj.setopt(pycurl.PROXY, proxy)
-                elif self.scheme in ('http', 'https'):
-                    if scheme not in ('http', 'https'):
-                        continue
-                    else:
-                        if proxy == '_none_': proxy = ""
-                        self.curl_obj.setopt(pycurl.PROXY, proxy)
-            
-        # FIXME username/password/auth settings
+        # proxy
+        if opts.proxy is not None:
+            self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+            self.curl_obj.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
+
+        if opts.username and opts.password:
+            if self.scheme in ('http', 'https'):
+                self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+
+            if opts.username and opts.password:
+                # apparently when applying them as curlopts they do not require quoting of any kind
+                userpwd = '%s:%s' % (opts.username, opts.password)
+                self.curl_obj.setopt(pycurl.USERPWD, userpwd)
 
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+            self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
             
         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1219,8 +1346,14 @@ class PyCurlFileObject():
             return
         
         try:
+            e = None
             self.curl_obj.perform()
-        except pycurl.error, e:
+        except pycurl.error, e: pass
+        self._do_perform_exc(e)
+
+    def _do_perform_exc(self, e):
+        # handle pycurl exception 'e'
+        if e:
             # XXX - break some of these out a bit more clearly
             # to other URLGrabErrors from 
             # http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1228,12 +1361,14 @@ class PyCurlFileObject():
             
             code = self.http_code
             errcode = e.args[0]
+            errurl = urllib.unquote(self.url)
+            
             if self._error[0]:
                 errcode = self._error[0]
                 
             if errcode == 23 and code >= 200 and code < 299:
-                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+                err.url = errurl
                 
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1379,23 @@ class PyCurlFileObject():
                 raise KeyboardInterrupt
             
             elif errcode == 28:
-                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
+                err.url = errurl
                 raise err
             elif errcode == 35:
                 msg = _("problem making ssl connection")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
             elif errcode == 37:
-                msg = _("Could not open/read %s") % (self.url)
+                msg = _("Could not open/read %s") % (errurl)
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
                 
             elif errcode == 42:
-                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+                err.url = errurl
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
                 # which almost always means something aborted it from outside
@@ -1272,33 +1407,94 @@ class PyCurlFileObject():
             elif errcode == 58:
                 msg = _("problem with the local client certificate")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif errcode == 60:
-                msg = _("client cert cannot be verified or client cert incorrect")
+                msg = _("Peer cert cannot be verified or peer cert invalid")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
             
             elif errcode == 63:
                 if self._error[1]:
                     msg = self._error[1]
                 else:
-                    msg = _("Max download size exceeded on %s") % (self.url)
+                    msg = _("Max download size exceeded on %s") % (errurl)
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
                     
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                if self.scheme in ['http', 'https']:
+                    if self.http_code in responses:
+                        resp = responses[self.http_code]
+                        msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+                    else:
+                        msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+                elif self.scheme in ['ftp']:
+                    msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+                else:
+                    msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
             else:
-                msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+                pyerr2str = { 5 : _("Couldn't resolve proxy"),
+                              6 : _("Couldn't resolve host"),
+                              7 : _("Couldn't connect"),
+                              8 : _("Bad reply to FTP server"),
+                              9 : _("Access denied"),
+                             11 : _("Bad reply to FTP pass"),
+                             13 : _("Bad reply to FTP pasv"),
+                             14 : _("Bad reply to FTP 227"),
+                             15 : _("Couldn't get FTP host"),
+                             17 : _("Couldn't set FTP type"),
+                             18 : _("Partial file"),
+                             19 : _("FTP RETR command failed"),
+                             22 : _("HTTP returned error"),
+                             23 : _("Write error"),
+                             25 : _("Upload failed"),
+                             26 : _("Read error"),
+                             27 : _("Out of Memory"),
+                             28 : _("Operation timed out"),
+                             30 : _("FTP PORT command failed"),
+                             31 : _("FTP REST command failed"),
+                             33 : _("Range failed"),
+                             34 : _("HTTP POST failed"),
+                             35 : _("SSL CONNECT failed"),
+                             36 : _("Couldn't resume download"),
+                             37 : _("Couldn't read file"),
+                             42 : _("Aborted by callback"),
+                             47 : _("Too many redirects"),
+                             51 : _("Peer certificate failed verification"),
+                             52 : _("Got nothing: SSL certificate expired?"),
+                             53 : _("SSL engine not found"),
+                             54 : _("SSL engine set failed"),
+                             55 : _("Network error send()"),
+                             56 : _("Network error recv()"),
+                             58 : _("Local certificate failed"),
+                             59 : _("SSL set cipher failed"),
+                             60 : _("Local CA certificate failed"),
+                             61 : _("HTTP bad transfer encoding"),
+                             63 : _("Maximum file size exceeded"),
+                             64 : _("FTP SSL failed"),
+                             67 : _("Authentication failure"),
+                             70 : _("Out of disk space on server"),
+                             73 : _("Remote file exists"),
+                              }
+                errstr = str(e.args[1])
+                if not errstr:
+                    errstr = pyerr2str.get(errcode, '<Unknown>')
+                msg = 'curl#%s - "%s"' % (errcode, errstr)
                 code = errcode
             err = URLGrabError(14, msg)
             err.code = code
             err.exception = e
             raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = urllib.unquote(self.url)
+                raise err
 
     def _do_open(self):
         self.curl_obj = _curl_cache
@@ -1333,7 +1529,11 @@ class PyCurlFileObject():
                 
         if self.opts.range:
             rt = self.opts.range
-            if rt[0]: rt = (rt[0] + reget_length, rt[1])
+            
+            if rt[0] is None:
+                rt = (0, rt[1])
+            rt = (rt[0] + reget_length, rt[1])
+            
 
         if rt:
             header = range_tuple_to_header(rt)
@@ -1407,22 +1607,7 @@ class PyCurlFileObject():
         _was_filename = False
         if type(self.filename) in types.StringTypes and self.filename:
             _was_filename = True
-            self._prog_reportname = str(self.filename)
-            self._prog_basename = os.path.basename(self.filename)
-            
-            if self.append: mode = 'ab'
-            else: mode = 'wb'
-
-            if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
-                                 (self.filename, mode))
-            try:
-                self.fo = open(self.filename, mode)
-            except IOError, e:
-                err = URLGrabError(16, _(\
-                  'error opening local file from %s, IOError: %s') % (self.url, e))
-                err.url = self.url
-                raise err
-
+            self._do_open_fo()
         else:
             self._prog_reportname = 'MEMORY'
             self._prog_basename = 'MEMORY'
@@ -1434,27 +1619,71 @@ class PyCurlFileObject():
             #fh, self._temp_name = mkstemp()
             #self.fo = open(self._temp_name, 'wb')
 
-            
-        self._do_perform()
-        
-
-
-        if _was_filename:
-            # close it up
+        try:            
+            self._do_perform()
+        except URLGrabError, e:
             self.fo.flush()
             self.fo.close()
-            # set the time
-            mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
-            if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+            raise e
+    
+        if _was_filename:
+            self._do_close_fo()
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+                
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
 
         self._complete = True
     
+    def _do_open_fo(self):
+        self._prog_reportname = str(self.filename)
+        self._prog_basename = os.path.basename(self.filename)
+        if self.append: mode = 'ab'
+        else: mode = 'wb'
+
+        if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+                             (self.filename, mode))
+        try:
+            self.fo = open(self.filename, mode)
+        except IOError, e:
+            err = URLGrabError(16, _(\
+              'error opening local file from %s, IOError: %s') % (self.url, e))
+            err.url = self.url
+            raise err
+
+    def _do_close_fo(self):
+        # close it up
+        self.fo.flush()
+        self.fo.close()
+
+        # Set the URL where we got it from:
+        if xattr is not None:
+            # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+            try:
+                xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+            except:
+                pass # Best effort, e.g. URL too long => IOError; ignore all.
+
+        # set the time
+        mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+        if mod_time != -1:
+            try:
+                os.utime(self.filename, (mod_time, mod_time))
+            except OSError, e:
+                err = URLGrabError(16, _(\
+                  'error setting timestamp on file %s from %s, OSError: %s') 
+                          % (self.filename, self.url, e))
+                err.url = self.url
+                raise err
+
     def _fill_buffer(self, amt=None):
         """fill the buffer to contain at least 'amt' bytes by reading
         from the underlying file object.  If amt is None, then it will
@@ -1526,17 +1755,20 @@ class PyCurlFileObject():
             if self._prog_running:
                 downloaded += self._reget_length
                 self.opts.progress_obj.update(downloaded)
-        except KeyboardInterrupt:
+        except (KeyboardInterrupt, IOError):
             return -1
     
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
-            max_size = self.size
-        if self.opts.size: # if we set an opts size use that, no matter what
-            max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
+
+        if cur > int(float(max_size) * 1.10):
 
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1544,13 +1776,6 @@ class PyCurlFileObject():
             return True
         return False
         
-    def _to_utf8(self, obj, errors='replace'):
-        '''convert 'unicode' to an encoded utf-8 byte string '''
-        # stolen from yum.i18n
-        if isinstance(obj, unicode):
-            obj = obj.encode('utf-8', errors)
-        return obj
-        
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
@@ -1582,9 +1807,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
         
-
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via urllib.URLopener.* """
+        return self.url
+        
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
 
+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+_libproxy_cache = None
+    
 
 #####################################################################
 # DEPRECATED FUNCTIONS
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..8731aed 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -90,7 +90,7 @@ CUSTOMIZATION
 import random
 import thread  # needed for locking to make this threadsafe
 
-from grabber import URLGrabError, CallbackObject, DEBUG
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
 
 def _(st): 
     return st
@@ -263,7 +263,8 @@ class MirrorGroup:
     def _parse_mirrors(self, mirrors):
         parsed_mirrors = []
         for m in mirrors:
-            if type(m) == type(''): m = {'mirror': m}
+            if isinstance(m, basestring):
+                m = {'mirror': _to_utf8(m)}
             parsed_mirrors.append(m)
         return parsed_mirrors
     
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..3d7e99a 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0):
 #        4. +                     ( 5, total: 32)
 #
 
+def _term_add_bar(tl, bar_max_length, pc):
+    blen = bar_max_length
+    bar  = '='*int(blen * pc)
+    if (blen * pc) - int(blen * pc) >= 0.5:
+        bar += '-'
+    return tl.add(' [%-*.*s]' % (blen, blen, bar))
+
+def _term_add_end(tl, osize, size):
+    if osize is not None:
+        if size > osize: # Is ??? better? Really need something to say < vs >.
+            return tl.add(' !!! '), True
+        elif size != osize:
+            return tl.add(' ... '), True
+    return tl.add(' ' * 5), False
+
 class TextMeter(BaseMeter):
     def __init__(self, fo=sys.stderr):
         BaseMeter.__init__(self)
@@ -259,13 +274,10 @@ class TextMeter(BaseMeter):
             ui_rate = tl.add(' %5sB/s' % ave_dl)
             # Make text grow a bit before we start growing the bar too
             blen = 4 + tl.rest_split(8 + 8 + 4)
-            bar  = '='*int(blen * frac)
-            if (blen * frac) - int(blen * frac) >= 0.5:
-                bar += '-'
-            ui_bar  = tl.add(' [%-*.*s]' % (blen, blen, bar))
-            out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
-                                              ui_sofar_pc, ui_pc, ui_bar,
-                                              ui_rate, ui_size, ui_time, ui_end)
+            ui_bar = _term_add_bar(tl, blen, frac)
+            out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+                                                ui_sofar_pc, ui_pc, ui_bar,
+                                                ui_rate,ui_size,ui_time, ui_end)
 
         self.fo.write(out)
         self.fo.flush()
@@ -284,12 +296,7 @@ class TextMeter(BaseMeter):
         tl = TerminalLine(8)
         ui_size = tl.add(' | %5sB' % total_size)
         ui_time = tl.add(' %9s' % total_time)
-        not_done = self.size is not None and amount_read != self.size
-        if not_done:
-            ui_end  = tl.add(' ... ')
-        else:
-            ui_end  = tl.add(' ' * 5)
-
+        ui_end, not_done = _term_add_end(tl, self.size, amount_read)
         out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
                                     ui_size, ui_time, ui_end)
         self.fo.write(out)
@@ -331,12 +338,21 @@ class MultiFileHelper(BaseMeter):
     def message(self, message):
         self.master.message_meter(self, message)
 
+class _FakeLock:
+    def acquire(self):
+        pass
+    def release(self):
+        pass
+
 class MultiFileMeter:
     helperclass = MultiFileHelper
-    def __init__(self):
+    def __init__(self, threaded=True):
         self.meters = []
         self.in_progress_meters = []
-        self._lock = thread.allocate_lock()
+        if threaded:
+            self._lock = thread.allocate_lock()
+        else:
+            self._lock = _FakeLock()
         self.update_period = 0.3 # seconds
         
         self.numfiles         = None
@@ -369,6 +385,7 @@ class MultiFileMeter:
 
     def end(self, now=None):
         if now is None: now = time.time()
+        self.re.update(self._amount_read(), now)
         self._do_end(now)
         
     def _do_end(self, now):
@@ -466,11 +483,20 @@ class MultiFileMeter:
 
 
 class TextMultiFileMeter(MultiFileMeter):
-    def __init__(self, fo=sys.stderr):
+    def __init__(self, fo=sys.stderr, threaded=True):
         self.fo = fo
-        MultiFileMeter.__init__(self)
+        MultiFileMeter.__init__(self, threaded)
 
     # files: ###/### ###%  data: ######/###### ###%  time: ##:##:##/##:##:##
+# New output, like TextMeter output...
+#       update: Size, All files
+#       -----------------------
+# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
+#                          8-22 1 3-4 1 6-12 1   8 3     6 1        9 1  3 1
+#       end
+#       ---
+# <text>                                 | <file size> <file elapsed time> 
+#  8-56                                  3          6 1                 9 5
     def _do_update_meter(self, meter, now):
         self._lock.acquire()
         try:
@@ -480,7 +506,7 @@ class TextMultiFileMeter(MultiFileMeter):
             tf = self.numfiles or 1
             pf = 100 * float(df)/tf + 0.49
             dd = self.re.last_amount_read
-            td = self.total_size
+            td = self.re.total
             pd = 100 * (self.re.fraction_read() or 0) + 0.49
             dt = self.re.elapsed_time()
             rt = self.re.remaining_time()
@@ -491,9 +517,33 @@ class TextMultiFileMeter(MultiFileMeter):
             ftd = format_number(td) + 'B'
             fdt = format_time(dt, 1)
             ftt = format_time(tt, 1)
-            
-            out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
-            self.fo.write('\r' + out)
+
+            frac = self.re.fraction_read() or 0
+            ave_dl = format_number(self.re.average_rate())
+            text = meter.text or meter.basename
+            if tf > 1:
+                text = '(%u/%u): %s' % (df+1, tf, text)
+
+            # Include text + ui_rate in minimal
+            tl = TerminalLine(8, 8+1+8)
+
+            ui_size = tl.add(' | %5sB' % format_number(dd))
+
+            ui_time = tl.add(' %9s' % format_time(rt))
+            ui_end  = tl.add(' ETA ')
+
+            ui_sofar_pc = tl.add(' %i%%' % pf,
+                                 full_len=len(" (100%)"))
+            ui_rate = tl.add(' %5sB/s' % ave_dl)
+
+            # Make text grow a bit before we start growing the bar too
+            blen = 4 + tl.rest_split(8 + 8 + 4)
+            ui_bar = _term_add_bar(tl, blen, frac)
+            out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+                                              ui_sofar_pc, ui_bar,
+                                              ui_rate, ui_size, ui_time,
+                                              ui_end)
+            self.fo.write(out)
             self.fo.flush()
         finally:
             self._lock.release()
@@ -502,15 +552,28 @@ class TextMultiFileMeter(MultiFileMeter):
         self._lock.acquire()
         try:
             format = "%-30.30s %6.6s    %8.8s    %9.9s"
-            fn = meter.basename
+            fn = meter.text or meter.basename
             size = meter.last_amount_read
             fsize = format_number(size) + 'B'
             et = meter.re.elapsed_time()
             fet = format_time(et, 1)
-            frate = format_number(size / et) + 'B/s'
-            
-            out = '%-79.79s' % (format % (fn, fsize, fet, frate))
-            self.fo.write('\r' + out + '\n')
+            frate = format_number(et and size / et) + 'B/s'
+            df = self.finished_files
+            tf = self.numfiles or 1
+
+            total_time = format_time(et)
+            total_size = format_number(size)
+            text = meter.text or meter.basename
+            if tf > 1:
+                text = '(%u/%u): %s' % (df, tf, text)
+
+            tl = TerminalLine(8)
+            ui_size = tl.add(' | %5sB' % total_size)
+            ui_time = tl.add(' %9s' % total_time)
+            ui_end, not_done = _term_add_end(tl, meter.size, size)
+            out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+                                        ui_size, ui_time, ui_end)
+            self.fo.write(out)
         finally:
             self._lock.release()
         self._do_update_meter(meter, now)
@@ -658,6 +721,8 @@ def format_time(seconds, use_hours=0):
     if seconds is None or seconds < 0:
         if use_hours: return '--:--:--'
         else:         return '--:--'
+    elif seconds == float('inf'):
+        return 'Infinite'
     else:
         seconds = int(seconds)
         minutes = seconds / 60
rpms / python-urlgrabber

Source Code

Files