Skip to content

Commit

Permalink
use thread for PDFBOX to make the app responsive
Browse files Browse the repository at this point in the history
  • Loading branch information
xulihang committed Jun 5, 2019
1 parent 3f5491e commit b7d6d95
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 65 deletions.
48 changes: 24 additions & 24 deletions BasicCAT/BasicCAT.b4j
Original file line number Diff line number Diff line change
Expand Up @@ -207,32 +207,32 @@ Module38=Map2Xml
Module39=SRX
Module4=CustomListView
Module40=POIWord
Module41=pdfbox
Module42=git
Module43=dictWebView
Module44=FontPicker
Module45=TermEditor
Module46=TermManager
Module47=HistoryViewer
Module48=ClientKVS
Module49=statistics
Module41=git
Module42=dictWebView
Module43=FontPicker
Module44=TermEditor
Module45=TermManager
Module46=HistoryViewer
Module47=ClientKVS
Module48=statistics
Module49=LanguagePairSelector
Module5=txtFilter
Module50=LanguagePairSelector
Module51=idmlUtils
Module52=editDistance
Module53=InputBox
Module54=MTParamsFiller
Module55=filterGenericUtils
Module56=TBX
Module57=CallSubUtils
Module58=viewSegment
Module59=serverLauncher
Module50=idmlUtils
Module51=editDistance
Module52=InputBox
Module53=MTParamsFiller
Module54=filterGenericUtils
Module55=TBX
Module56=CallSubUtils
Module57=viewSegment
Module58=serverLauncher
Module59=languageChooser
Module6=TM
Module60=languageChooser
Module61=TMEditor
Module62=TextFlow
Module63=opennlp
Module64=coordinate
Module60=TMEditor
Module61=TextFlow
Module62=opennlp
Module63=coordinate
Module64=pdfbox
Module7=Term
Module8=KeyValueStore
Module9=ProjectSettings
Expand Down
2 changes: 1 addition & 1 deletion BasicCAT/BasicCAT.b4j.meta
Original file line number Diff line number Diff line change
Expand Up @@ -194,4 +194,4 @@ ModuleClosedNodes7=
ModuleClosedNodes8=
ModuleClosedNodes9=
SelectedBuild=0
VisibleModules=1,2,3,4,5,6,7,8,9,10,42,15,12,21
VisibleModules=1,2,3,4,5,6,7,8,9,10,41,15,12,21,64,36,59
40 changes: 21 additions & 19 deletions BasicCAT/PDF2TXT.bas
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,16 @@ Sub StripButton_MouseClicked (EventData As MouseEvent)
fx.Msgbox(frm,"Please choose a pdf file first.","")
Return
End If
Dim pdfbox1 As pdfbox
pdfbox1.Initialize(label1.Text)
If IncludePageNumCheckBox.Checked Then
If offsetTextField.Text="" Or PageAffixTextField.Text="" Then
fx.Msgbox(frm,"Please fill affix and offset first.","")
Return
End If
TextArea1.text=pdfbox.stripPDFText(label1.Text,True,FacingPageCheckBox.Checked,PageAffixTextField.Text,offsetTextField.Text)
TextArea1.text=pdfbox1.stripPDFText(True,FacingPageCheckBox.Checked,PageAffixTextField.Text,offsetTextField.Text)
Else
TextArea1.text=pdfbox.stripPDFText(label1.Text,False,False,"",0)
TextArea1.text=pdfbox1.stripPDFText(False,False,"",0)
End If
End Sub

Expand Down Expand Up @@ -117,26 +119,23 @@ Sub ocrButton_MouseClicked (EventData As MouseEvent)

Dim lc As languageChooser
lc.Initialize
Dim langs As List
langs=lc.ShowAndWait
Dim langsParam As String
For Each chkBox As CheckBox In langs
If chkBox.Checked Then
langsParam=langsParam&chkBox.Text&"+"
End If
Next
If langsParam.EndsWith("+") Then
langsParam=langsParam.SubString2(0,langsParam.Length-1)
End If
Log(langsParam)
Dim langsParam As String=lc.ShowAndWait
If langsParam="" Then
Return
End If

Label2.Text="Converting pdf to images..."
Dim files As List
wait for (pdfbox.getImage(File.GetFileParent(label1.Text),File.GetName(label1.Text))) complete (result As List)
files=result
files.Initialize
Dim dir As String=File.GetFileParent(label1.Text)
Dim filename As String=File.GetName(label1.Text)
Dim pdfbox1 As pdfbox
pdfbox1.Initialize(File.Combine(dir,filename))
wait for (pdfbox1.getImageAsync()) complete (result As Object)
For i=0 To pdfbox1.PageNum-1
files.Add(File.Combine(dir,i&".jpg"))
Next

Label2.Text="OCRing..."
If IncludePageNumCheckBox.Checked Then
wait for (ocrWithPagenum(files,langsParam,PageAffixTextField.Text,offsetTextField.Text)) complete (text As String)
Expand Down Expand Up @@ -200,7 +199,8 @@ Sub ocrWithPagenum(files As List,langsParam As String,affix As String,offset As
Else
path="tesseract"
End If
Dim content As String
Dim contentSB As StringBuilder
contentSB.Initialize
Dim pdfnum As Int=0
For i=0 To files.Size-1
pdfnum=pdfnum+1
Expand Down Expand Up @@ -228,15 +228,17 @@ Sub ocrWithPagenum(files As List,langsParam As String,affix As String,offset As
If Success And ExitCode = 0 Then
Log("Success")
Log(StdOut)
content=content&pageStart&CRLF&CRLF&removeLines(File.ReadString(dir,i&".txt"))
contentSB.Append(pageStart).Append(CRLF)
contentSB.Append(CRLF).Append(File.ReadString(dir,i&".txt"))
'removeLines
Else
Log("Error: " & StdErr)
End If


Next

Return content
Return contentSB.ToString
End Sub

Sub ocr(files As List,langsParam As String) As ResumableSub
Expand Down
5 changes: 3 additions & 2 deletions BasicCAT/Project.bas
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ Public Sub commitAndPush(commitMessage As String)
End If
End If
Else
wait for (updateLocalFileBasedonFetch(username,password,email)) Complete (success as Object)
wait for (updateLocalFileBasedonFetch(username,password,email)) Complete (success As Object)
Dim diffList As List
diffList=projectGit.diffList
Log(diffList)
Expand Down Expand Up @@ -575,7 +575,7 @@ Sub samelocalHeadAndRemoteHead(username As String,password As String,fetch As Bo
Return result
End Sub

Sub updateLocalFileBasedonFetch(username As String,password As String,email As String) as ResumableSub
Sub updateLocalFileBasedonFetch(username As String,password As String,email As String) As ResumableSub
wait for (samelocalHeadAndRemoteHead(username,password,True)) Complete (isSame As Boolean)
If isSame = False Then
Dim localHead,remoteHead As String
Expand Down Expand Up @@ -637,6 +637,7 @@ Sub updateLocalFileBasedonFetch(username As String,password As String,email As S
End If

Log("worddir,after: "&projectGit.getWorkdirPath)
return True
End Sub

Sub updateWorkFile(filename As String) As Boolean
Expand Down
19 changes: 17 additions & 2 deletions BasicCAT/languageChooser.bas
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,26 @@ Public Sub Initialize
Next
End Sub

Public Sub ShowAndWait As List
Public Sub ShowAndWait As String
frm.ShowAndWait
Return ListView1.Items
Dim langsParam As String
For Each chkBox As CheckBox In ListView1.Items
If chkBox.Checked Then
langsParam=langsParam&chkBox.Text&"+"
End If
Next
If langsParam.EndsWith("+") Then
langsParam=langsParam.SubString2(0,langsParam.Length-1)
End If
Log(langsParam)
Return langsParam
End Sub

'Mouse-click handler for OKButton.
'Closes the form without touching ListView1, so the checkbox items are
'still present when ShowAndWait reads them after frm.ShowAndWait.
'EventData is not used.
Sub OKButton_MouseClicked (EventData As MouseEvent)
frm.Close
End Sub

'CloseRequest handler for the form "frm".
'NOTE(review): presumably raised when the user closes the window directly
'rather than pressing OK - confirm against the B4J Form documentation.
'EventData is not used.
Sub frm_CloseRequest (EventData As Event)
'Empty the list first: with no items left, ShowAndWait finds no checked
'box and returns an empty string.
ListView1.Items.Clear
frm.Close
End Sub
57 changes: 40 additions & 17 deletions BasicCAT/pdfbox.bas
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
B4J=true
Group=Default Group
ModulesStructureVersion=1
Type=StaticCode
Version=6.51
Type=Class
Version=7.32
@EndOfDesignText@
'Static code module
Sub Process_Globals
Sub Class_Globals
Private fx As JFX
Private th As Thread
Private doc As JavaObject
Private path As String
End Sub

Sub stripPDFText(filepath As String, includePageNum As Boolean,isFacingPage As Boolean,affix As String,offset As Int) As String
'Initializes the object. You can add parameters to this method if needed.
Public Sub Initialize(filePath As String)
th.Initialise("th")
path=filePath
Dim PDDocument As JavaObject
PDDocument.InitializeStatic("org.apache.pdfbox.pdmodel.PDDocument")
Dim doc As JavaObject
doc=PDDocument.RunMethodJO("load",Array(getFile(filepath)))
doc=PDDocument.RunMethodJO("load",Array(getFile(filePath)))
End Sub

Public Sub stripPDFText(includePageNum As Boolean,isFacingPage As Boolean,affix As String,offset As Int) As String

Dim PDFTextStripper As JavaObject
PDFTextStripper.InitializeNewInstance("org.apache.pdfbox.text.PDFTextStripper",Null)
Dim pageNum As Int
Expand Down Expand Up @@ -45,27 +53,43 @@ Sub stripPDFText(filepath As String, includePageNum As Boolean,isFacingPage As B
Return text
End Sub

Sub getImage(dir As String,filename As String) As ResumableSub

'Returns the number of pages of the loaded document, as reported by the
'"getNumberOfPages" method of the class-level JavaObject doc.
'Declared as getPageNum, so callers can read it as the PageNum property.
Public Sub getPageNum As Int
	'NOTE(review): this system property is also set in getImage; it is kept
	'here so the sub's side effects stay exactly as before - confirm whether
	'it is actually needed for a plain page count.
	SetSystemProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")
	'The unused local "files" list has been removed.
	Return doc.RunMethod("getNumberOfPages",Null)
End Sub

'Runs getImage on the thread object "th" and resumes once the th_Ended
'event arrives.
'Returns (through the ResumableSub Complete event) the endedOK flag
'delivered by th_Ended.
Public Sub getImageAsync As ResumableSub
	'getImage declares one String parameter that it never reads, hence the
	'dummy argument.
	th.Start(Me,"getImage",Array As Object("placeholder"))
	wait for th_Ended(endedOK As Boolean, error As String)
	Log(endedOK)
	If endedOK=False Then
		'Surface the failure reason instead of silently discarding it.
		Log("getImage failed: "&error)
	End If
	Return endedOK
End Sub

Public Sub getImage(param As String)
Dim files As List
files.Initialize
SetSystemProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")
Dim PDDocument As JavaObject
PDDocument.InitializeStatic("org.apache.pdfbox.pdmodel.PDDocument")
Dim doc As JavaObject
doc=PDDocument.RunMethodJO("load",Array(getFile(File.Combine(dir,filename))))
Dim pageNum As Int
pageNum=doc.RunMethod("getNumberOfPages",Null)
Dim PDFRenderer As JavaObject
PDFRenderer.InitializeNewInstance("org.apache.pdfbox.rendering.PDFRenderer",Array(doc))
For i=0 To pageNum-1
Log(i)
Sleep(0)
renderImageToFile(PDFRenderer,files,dir,i)
'Sleep(0)
'files.Add(File.Combine(dir,i&".jpg"))
renderImageToFile(PDFRenderer,File.GetFileParent(path),i)
Next
Return files
'Return files
End Sub

Sub renderImageToFile(PDFRenderer As JavaObject,files As List,dir As String,i As Int)
Sub renderImageToFile(PDFRenderer As JavaObject,dir As String,i As Int)
Dim bi As JavaObject
Dim dpi As Float
dpi=150
Expand All @@ -76,11 +100,10 @@ Sub renderImageToFile(PDFRenderer As JavaObject,files As List,dir As String,i As
imageIO.InitializeStatic("javax.imageio.ImageIO")
imageIO.RunMethod("write",Array(bi,"jpg",out))
out.Close
files.Add(File.Combine(dir,i&".jpg"))
End Sub

Sub getFile(path As String) As JavaObject
Sub getFile(filepath As String) As JavaObject
Dim fileJO As JavaObject
fileJO.InitializeNewInstance("java.io.File",Array(path))
fileJO.InitializeNewInstance("java.io.File",Array(filepath))
Return fileJO
End Sub

0 comments on commit b7d6d95

Please sign in to comment.