# Patch for chm2pdf: cope with CHM-internal paths and image URLs that contain spaces.
#
# Hunk 1 (original line 115): fills urls_list by capturing everything that follows
#   "normal file" on each listing line with a regex, instead of taking field 5 of a
#   whitespace split, so a path such as "/doc space/" is no longer cut at its first
#   space. Only results whose first character is "/" are appended.
#   NOTE(review): spline[0] raises IndexError when the captured path is empty, and a
#   line without a trailing "\n" does not match the pattern, so re.sub returns it
#   unchanged - confirm neither case can occur in the listing file.
#
# Hunk 2 (original line 148): turns "%20" in iurl into a space before the substring
#   test against each entry of objective_urls, and limits the page substitution to
#   occurrences of iurl that directly follow a double quote, matched
#   case-insensitively. A backslash followed by a space in img_filename is replaced
#   by a plain space in the replacement text.
#   NOTE(review): iurl is concatenated into the pattern without re.escape, so any
#   regex metacharacters it contains are interpreted as such - confirm this is intended.
#
# Hunk 3 (original line 459): replaces every "%20" in the page text with a space
#   before the first-pass filename substitutions run.
--- chm2pdf.orig 2008-07-09 14:42:26.000000000 +0400 +++ chm2pdf 2013-05-18 16:16:48.097014228 +0400 @@ -115,8 +115,14 @@ urls_list=[] for line in flist.readlines()[3:]: #print 'line',line - spline=line.split() - urls_list.append(spline[5]) + #This won't work if internal paths of CHM contains spaces: e.g. /doc space/ will only become /doc + #spline=line.split() + #urls_list.append(spline[5]) + #this should work better: + spline= re.sub(r".*?normal file\s*(.*?)\n$", "\\1", line) + if spline[0]=="/": + #print "got spline="+spline + urls_list.append( spline) flist.close() # os.remove(CHM2PDF_WORK_DIR+'/urlslist.txt') @@ -148,13 +154,17 @@ img_filename = '' for item in objective_urls: - if iurl in item: + #objective_urls has "real path", whereas image_catcher.imgurls can contain %20! + #e.g. item='/doc space/image path/velocity space.gif iurl=image%20path/velocity%20space.gif + iiurl= re.sub('%20',' ',iurl) + if iiurl in item: img_filename=CHM2PDF_ORIG_DIR+item if ';' in img_filename: #hack to get rid of mysterious ; in filenames and urls... img_filename=img_filename.split(';')[0] # substitute the new image filenames - but only if an img_filename was found! if img_filename: - page=re.sub(iurl,img_filename,page) + #r = Python also has "raw strings" which do not apply special treatment to backslashes + page=re.sub(r'(?i)"'+iurl,'"'+re.sub('\\\\ ', ' ', img_filename),page) # We substitute the CSS URLs of input_file with the *actual* URLs on the CHM2PDF_ORIG_DIR directory @@ -459,6 +469,10 @@ page=pf.read() pf.close() + # Some names contain a '%20' (an HTML code for a space). We substitute with a "real space" + # otherwise we won't be able to match to the real files. + page = re.sub('%20',' ',page) + # Substitutions in 1st pass: we replace the original filenames with their corresponding "garbled" equivalents. for match_string in match_strings: replace_string = replace_garbled_strings[match_strings.index(match_string)]