• R/O
  • HTTP
  • SSH
  • HTTPS

libre10: Commit

libre10 git


Commit MetaInfo

Revision: 99034d9f27d5d09cdac180af64930e24917149c5 (tree)
Zeit: 2013-11-19 01:07:04
Autor: gn64_jp <gn64@rec1...>
Committer: gn64_jp

Log Message

fix OCR pdf including multiple space between characters.

Zusammenfassung der Änderungen

Diff

--- a/index/pdf2xml.py
+++ b/index/pdf2xml.py
@@ -66,6 +66,9 @@ create table pdffile (
6666 dbcon.execute(sql)
6767 except:
6868 ""
69+rex1=re.compile(ur'[\n<>&\x0c]',ur'')
70+rex2=re.compile(ur'([^0-9a-zA-Z.,_\-])[/s ]+?([^0-9a-zA-Z.,_\-])')
71+rex3=re.compile('\x00')
6972 def index_rebuild():
7073 sql=u"SELECT id,title,part,startpage,endpage,page FROM pdffile"
7174 cur = dbcon.execute(sql)
@@ -134,11 +137,11 @@ def PDF2TEXT(pdfpath,dstpath,idnum):
134137 def TEXT2solr(solrcon,titletxt,textpath,pagenum,pagemax,pdfpath,idnum):
135138 f=codecs.open(textpath,"r","utf-8","ignore")
136139 data1=f.read()
137- data1=data1.replace(u"\n",u" ")
138- data1=data1.replace(u"<",u"")
139- data1=data1.replace(u">",u"")
140- data1=data1.replace(u"&","")
141- data1=data1.replace('\x0c',u"")
140+ data1=rex1.sub(u'',data1)
141+ data1=rex2.sub(ur'\1\2',data1)
142+ data1=rex3.sub(u'',data1)
143+ data1=rex2.sub(ur'\1\2',data1)
144+ data1=rex3.sub(u'',data1)
142145 title_g=titletxt.split("_Part")[0]
143146 title_g_id=hashlib.sha224(title_g.encode("utf-8")).hexdigest()
144147 print titletxt+" : "+str(pagenum)+"/"+str(pagemax)
Show on old repository browser