--- chm2pdf.orig 2008-07-09 14:42:26.000000000 +0400
+++ chm2pdf 2013-05-18 16:16:48.097014228 +0400
@@ -115,8 +115,14 @@
urls_list=[]
for line in flist.readlines()[3:]:
#print 'line',line
- spline=line.split()
- urls_list.append(spline[5])
+ #This won't work if internal paths of the CHM contain spaces: e.g. "/doc space/" would be truncated to "/doc"
+ #spline=line.split()
+ #urls_list.append(spline[5])
+ #Instead, capture everything after the "normal file" column, so that embedded spaces are kept:
+ spline= re.sub(r".*?normal file\s*(.*?)\n$", "\\1", line)
+ if spline[0]=="/":
+ #print "got spline="+spline
+ urls_list.append( spline)
flist.close()
# os.remove(CHM2PDF_WORK_DIR+'/urlslist.txt')
@@ -148,13 +154,17 @@
img_filename = ''
for item in objective_urls:
- if iurl in item:
+ #objective_urls holds the real (decoded) paths, whereas image_catcher.imgurls can contain %20 escapes!
+ #e.g. item='/doc space/image path/velocity space.gif', iurl='image%20path/velocity%20space.gif'
+ iiurl= re.sub('%20',' ',iurl)
+ if iiurl in item:
img_filename=CHM2PDF_ORIG_DIR+item
if ';' in img_filename: #hack to get rid of mysterious ; in filenames and urls...
img_filename=img_filename.split(';')[0]
# substitute the new image filenames - but only if an img_filename was found!
if img_filename:
- page=re.sub(iurl,img_filename,page)
+ #The r prefix makes a Python "raw string", in which backslashes get no special treatment
+ page=re.sub(r'(?i)"'+iurl,'"'+re.sub('\\\\ ', ' ', img_filename),page)
# We substitute the CSS URLs of input_file with the *actual* URLs on the CHM2PDF_ORIG_DIR directory
@@ -459,6 +469,10 @@
page=pf.read()
pf.close()
+ # Some names contain a '%20' (the URL percent-encoding of a space). We substitute a real space,
+ # otherwise we won't be able to match them to the real files.
+ page = re.sub('%20',' ',page)
+
# Substitutions in 1st pass: we replace the original filenames with their corresponding "garbled" equivalents.
for match_string in match_strings:
replace_string = replace_garbled_strings[match_strings.index(match_string)]