libre10 git
Revision | 99034d9f27d5d09cdac180af64930e24917149c5 (tree) |
---|---|
Zeit | 2013-11-19 01:07:04 |
Autor | gn64_jp <gn64@rec1...> |
Committer | gn64_jp |
Fix OCR PDFs that include multiple spaces between characters.
@@ -66,6 +66,9 @@ create table pdffile ( | ||
66 | 66 | dbcon.execute(sql) |
67 | 67 | except: |
68 | 68 | "" |
69 | +rex1=re.compile(ur'[\n<>&\x0c]',ur'') | |
70 | +rex2=re.compile(ur'([^0-9a-zA-Z.,_\-])[/s ]+?([^0-9a-zA-Z.,_\-])') | |
71 | +rex3=re.compile('\x00') | |
69 | 72 | def index_rebuild(): |
70 | 73 | sql=u"SELECT id,title,part,startpage,endpage,page FROM pdffile" |
71 | 74 | cur = dbcon.execute(sql) |
@@ -134,11 +137,11 @@ def PDF2TEXT(pdfpath,dstpath,idnum): | ||
134 | 137 | def TEXT2solr(solrcon,titletxt,textpath,pagenum,pagemax,pdfpath,idnum): |
135 | 138 | f=codecs.open(textpath,"r","utf-8","ignore") |
136 | 139 | data1=f.read() |
137 | - data1=data1.replace(u"\n",u" ") | |
138 | - data1=data1.replace(u"<",u"") | |
139 | - data1=data1.replace(u">",u"") | |
140 | - data1=data1.replace(u"&","") | |
141 | - data1=data1.replace('\x0c',u"") | |
140 | + data1=rex1.sub(u'',data1) | |
141 | + data1=rex2.sub(ur'\1\2',data1) | |
142 | + data1=rex3.sub(u'',data1) | |
143 | + data1=rex2.sub(ur'\1\2',data1) | |
144 | + data1=rex3.sub(u'',data1) | |
142 | 145 | title_g=titletxt.split("_Part")[0] |
143 | 146 | title_g_id=hashlib.sha224(title_g.encode("utf-8")).hexdigest() |
144 | 147 | print titletxt+" : "+str(pagenum)+"/"+str(pagemax) |