DocumentPipeline.py
######################################
# Example Document Processing pipeline
######################################
from org.myrobotlab.service import Runtime
from org.myrobotlab.document.transformer import WorkflowConfiguration
from org.myrobotlab.document.transformer import StageConfiguration
# create the pipeline service
pipeline = Runtime.createAndStart("docproc", "DocumentPipeline")
# optionally, set the workflow name on the pipeline service
# pipeline.workflowName = "default"
# create a workflow to load into that pipeline service
workflowConfig = WorkflowConfiguration()
workflowConfig.setName("default")
staticFieldStageConfig = StageConfiguration()
staticFieldStageConfig.setStageClass("org.myrobotlab.document.transformer.SetStaticFieldValue")
staticFieldStageConfig.setStageName("SetTableField")
# statically assign the value of "MRL" to the field "table" on the document
staticFieldStageConfig.setStringParam("table", "MRL")
workflowConfig.addStage(staticFieldStageConfig)
# a stage that runs OpenNLP (natural language processing) over the "description" field
openNLPConfig = StageConfiguration()
openNLPConfig.setStageClass("org.myrobotlab.document.transformer.OpenNLP")
openNLPConfig.setStageName("OpenNLP")
openNLPConfig.setStringParam("textField", "description")
workflowConfig.addStage(openNLPConfig)
# a stage that sends the document to Solr
sendToSolrConfig = StageConfiguration()
sendToSolrConfig.setStageClass("org.myrobotlab.document.transformer.SendToSolr")
sendToSolrConfig.setStageName("SendToSolr")
sendToSolrConfig.setStringParam("solrUrl", "http://www.skizatch.org:8983/solr/graph")
workflowConfig.addStage(sendToSolrConfig)
# set the config on the pipeline service
pipeline.setConfig(workflowConfig)
# initialize the pipeline (load the config)
pipeline.initalize()
# create a connector that crawls MyRobotLab RSS url
rss = Runtime.createAndStart("rss", "RSSConnector")
# Attach the output of the rss connector to the pipeline
rss.addDocumentListener(pipeline)
# tell the RSS connector to start crawling the site
rss.startCrawling()
# the connector issues a flush when it's done crawling, so no explicit flush is needed here
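
######################################
# Optional: adding another stage (sketch)
######################################
# A minimal sketch, not part of the original example: further stages can be
# added to the workflow the same way, as long as they are added to
# workflowConfig before pipeline.setConfig() is called. Only the
# StageConfiguration calls already used above are assumed here; the stage
# name and the "source"/"rss" field value are illustrative.
#
# sourceFieldStageConfig = StageConfiguration()
# sourceFieldStageConfig.setStageClass("org.myrobotlab.document.transformer.SetStaticFieldValue")
# sourceFieldStageConfig.setStageName("SetSourceField")
# sourceFieldStageConfig.setStringParam("source", "rss")
# workflowConfig.addStage(sourceFieldStageConfig)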