
Commit 46c92e5

Updated fin/leg ChunkMappers model card (JohnSnowLabs#13482)
1 parent 79d12ad commit 46c92e5

12 files changed: +343 -318 lines

docs/_posts/bunyamin-polat/2023-01-19-finmapper_nasdaq_ticker_stock_screener_en.md

Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,8 @@ ner_converter = nlp.NerConverter()\
 
 CM = finance.ChunkMapperModel.pretrained('finmapper_nasdaq_ticker_stock_screener', 'en', 'finance/models')\
     .setInputCols(["ner_chunk"])\
-    .setOutputCol("mappings")
+    .setOutputCol("mappings")\
+    .setEnableFuzzyMatching(True)
 
 pipeline = nlp.Pipeline().setStages([document_assembler,
                                      tokenizer,
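
For reference, a minimal sketch of querying the updated mapper once the pipeline in this model card is assembled. It assumes the `pipeline` object built in that card, an active SparkSession named `spark`, and the standard Spark NLP `LightPipeline.fullAnnotate` API (annotations expose `.result` and `.metadata`); the expectation is that with `setEnableFuzzyMatching(True)` chunk texts no longer have to be exact dictionary keys to be mapped.

```python
# Illustrative sketch only; `pipeline` and `spark` are assumed from the model card above.
empty_df = spark.createDataFrame([[""]]).toDF("text")
lp = nlp.LightPipeline(pipeline.fit(empty_df))

annotations = lp.fullAnnotate("There is heavy trading in TSLA this morning.")[0]

# Each mapping annotation carries the mapped value in .result and the
# relation name in its .metadata map.
for ann in annotations["mappings"]:
    print(ann.metadata.get("relation"), "->", ann.result)
```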

docs/_posts/josejuanmartinez/2022-08-09-finmapper_nasdaq_companyname_en_3_2.md

Lines changed: 29 additions & 42 deletions
@@ -41,56 +41,37 @@ It can be optionally combined with Entity Resolution to normalize first the name
 ```python
 
 document_assembler = nlp.DocumentAssembler()\
-    .setInputCol('text')\
-    .setOutputCol('document')
+    .setInputCol('text')\
+    .setOutputCol('document')
 
 tokenizer = nlp.Tokenizer()\
-    .setInputCols("document")\
-    .setOutputCol("token")
+    .setInputCols("document")\
+    .setOutputCol("token")
 
 embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
-    .setInputCols(["document", "token"]) \
-    .setOutputCol("embeddings")
+    .setInputCols(["document", "token"])\
+    .setOutputCol("embeddings")
 
 ner_model = finance.NerModel.pretrained('finner_orgs_prods_alias', 'en', 'finance/models')\
-    .setInputCols(["document", "token", "embeddings"])\
-    .setOutputCol("ner")
-
+    .setInputCols(["document", "token", "embeddings"])\
+    .setOutputCol("ner")
+
 ner_converter = nlp.NerConverter()\
-    .setInputCols(["document", "token", "ner"])\
-    .setOutputCol("ner_chunk")
-
-# Optional: To normalize the ORG name using NASDAQ data before the mapping
-##########################################################################
-chunkToDoc = nlp.Chunk2Doc()\
-    .setInputCols("ner_chunk")\
-    .setOutputCol("ner_chunk_doc")
-
-chunk_embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use_lg", "en")\
-    .setInputCols(["ner_chunk_doc"])\
-    .setOutputCol("chunk_embeddings")
-
-use_er_model = finance.SentenceEntityResolverModel.pretrained('finel_nasdaq_data_company_name', 'en', 'finance/models')\
-    .setInputCols("chunk_embeddings")\
-    .setOutputCol('normalized')\
-    .setDistanceFunction("EUCLIDEAN")
-##########################################################################
-
-CM = finance.ChunkMapperModel()\
-    .pretrained('finmapper_nasdaq_companyname', 'en', 'finance/models')\
-    .setInputCols(["normalized"])\ #or ner_chunk without normalization
-    .setOutputCol("mappings")
+    .setInputCols(["document", "token", "ner"])\
+    .setOutputCol("ner_chunk")
+
+CM = finance.ChunkMapperModel().pretrained('finmapper_nasdaq_companyname', 'en', 'finance/models')\
+    .setInputCols(["ner_chunk"])\
+    .setOutputCol("mappings")\
+    .setEnableFuzzyMatching(True)
 
 pipeline = nlp.Pipeline().setStages([document_assembler,
-                                     tokenizer,
-                                     embeddings,
-                                     ner_model,
-                                     ner_converter,
-                                     chunkToDoc, # Optional for normalization
-                                     chunk_embeddings, # Optional for normalization
-                                     use_er_model, # Optional for normalization
-                                     CM])
-
+                                     tokenizer,
+                                     embeddings,
+                                     ner_model,
+                                     ner_converter,
+                                     CM])
+
 text = """Altaba Inc. is a company which ..."""
 
 test_data = spark.createDataFrame([[text]]).toDF("text")
@@ -107,7 +88,13 @@ lp.fullAnnotate(text)
 ## Results
 
 ```bash
-[Row(mappings=[Row(annotatorType='labeled_dependency', begin=0, end=10, result='AABA', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'ticker', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Altaba Inc.', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'company_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Altaba', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'short_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Asset Management', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'industry', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Financial Services', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'sector', 'all_relations': ''}, embeddings=[])])]
+{
+    "ticker": "AABA",
+    "company_name": "Altaba Inc.",
+    "short_name": "Altaba",
+    "industry": "Asset Management",
+    "sector": "Financial Services"
+}
 ```
 
 {:.model-param}
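
For reference, a hedged sketch of collapsing the `mappings` annotations into the flat dictionary shown under Results. It assumes the `pipeline`, `test_data`, and `text` objects defined in the snippet above and the standard Spark NLP `LightPipeline.fullAnnotate` output, where each annotation exposes `.result` and a `.metadata` map containing a `relation` key for matched chunks.

```python
# Illustrative sketch only; pipeline, test_data and text come from the model card above.
model = pipeline.fit(test_data)
lp = nlp.LightPipeline(model)

annotations = lp.fullAnnotate(text)[0]

# Keep only annotations that actually carry a relation (unmatched chunks may not).
mapping = {ann.metadata["relation"]: ann.result
           for ann in annotations["mappings"]
           if "relation" in ann.metadata}

print(mapping)
# Expected shape: {'ticker': 'AABA', 'company_name': 'Altaba Inc.', ...}
```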

docs/_posts/josejuanmartinez/2022-08-09-finmapper_nasdaq_ticker_en_3_2.md

Lines changed: 24 additions & 13 deletions
@@ -46,31 +46,36 @@ tokenizer = nlp.Tokenizer()\
     .setOutputCol("token")
 
 tokenClassifier = nlp.RoBertaForTokenClassification.pretrained("finner_roberta_ticker", "en", "finance/models")\
-    .setInputCols(["document",'token'])\
-    .setOutputCol("ner")
+    .setInputCols(["document",'token'])\
+    .setOutputCol("ner")
 
 ner_converter = nlp.NerConverter()\
     .setInputCols(["document", "token", "ner"])\
     .setOutputCol("ner_chunk")
 
-CM = finance.ChunkMapperModel()\
-    .pretrained('finmapper_nasdaq_companyname', 'en', 'finance/models')\
+CM = finance.ChunkMapperModel().pretrained('finmapper_nasdaq_ticker', 'en', 'finance/models')\
     .setInputCols(["ner_chunk"])\
     .setOutputCol("mappings")\
-    .setRel('company_name')
-
-pipeline = Pipeline().setStages([document_assembler,
-                                 tokenizer,
-                                 tokenClassifier,
-                                 ner_converter,
-                                 CM])
+    .setRel('company_name')\
+    .setEnableFuzzyMatching(True)
+
+pipeline = nlp.Pipeline().setStages(
+    [
+        document_assembler,
+        tokenizer,
+        tokenClassifier,
+        ner_converter,
+        CM
+    ]
+)
 
 text = ["""There are some serious purchases and sales of AMZN stock today."""]
 
 test_data = spark.createDataFrame([text]).toDF("text")
 
 model = pipeline.fit(test_data)
-res= model.transform(test_data)
+
+res = model.transform(test_data)
 
 res.select('mappings').collect()
 ```
@@ -80,7 +85,13 @@ res.select('mappings').collect()
 ## Results
 
 ```bash
-[Row(mappings=[Row(annotatorType='labeled_dependency', begin=46, end=49, result='AMZN', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'ticker', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Amazon.com Inc.', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'company_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Amazon.com', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'short_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Retail - Apparel & Specialty', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'industry', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Consumer Cyclical', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'sector', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=57, end=61, result='NONE', metadata={'sentence': '0', 'chunk': '1', 'entity': 'today'}, embeddings=[])])]
+{
+    "ticker": "AMZN",
+    "company_name": "Amazon.com Inc.",
+    "short_name": "Amazon.com",
+    "industry": "Retail - Apparel & Specialty",
+    "sector": "Consumer Cyclical"
+}
 ```
 
 {:.model-param}
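
When the output is consumed as a Spark DataFrame rather than through a LightPipeline, the `mappings` column can be flattened with plain Spark SQL. A hedged sketch, assuming the `res` DataFrame produced above and Spark NLP's usual annotation schema (an array of structs whose `metadata` map holds `entity` and `relation` keys):

```python
from pyspark.sql import functions as F

# Illustrative sketch only: one output row per (matched chunk, relation, mapped value).
res.select(F.explode("mappings").alias("m")) \
   .select(F.col("m.metadata")["entity"].alias("matched_chunk"),
           F.col("m.metadata")["relation"].alias("relation"),
           F.col("m.result").alias("value")) \
   .show(truncate=False)
```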

docs/_posts/josejuanmartinez/2022-08-18-finmapper_edgar_companyname_en_3_2.md

Lines changed: 42 additions & 45 deletions
@@ -45,77 +45,74 @@ IMPORTANT: Chunk Mappers work with exact matches, so before using Chunk Mapping,
 {% include programmingLanguageSelectScalaPythonNLU.html %}
 
 ```python
-documentAssembler = nlp.DocumentAssembler()\
-    .setInputCol("text")\
-    .setOutputCol("document")
+document_assembler = nlp.DocumentAssembler()\
+    .setInputCol("text")\
+    .setOutputCol("document")
 
 tokenizer = nlp.Tokenizer()\
-    .setInputCols(["document"])\
-    .setOutputCol("token")
+    .setInputCols(["document"])\
+    .setOutputCol("token")
 
 embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
-    .setInputCols(["document", "token"]) \
-    .setOutputCol("embeddings")
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
 
 ner_model = finance.NerModel.pretrained("finner_orgs_prods_alias", "en", "finance/models")\
-    .setInputCols(["document", "token", "embeddings"])\
-    .setOutputCol("ner")
+    .setInputCols(["document", "token", "embeddings"])\
+    .setOutputCol("ner")
 
 ner_converter = nlp.NerConverter()\
-    .setInputCols(["document","token","ner"])\
-    .setOutputCol("ner_chunk")
-
-# Optional: To normalize the ORG name using NASDAQ data before the mapping
-##########################################################################
-chunkToDoc = nlp.Chunk2Doc()\
-    .setInputCols("ner_chunk")\
-    .setOutputCol("ner_chunk_doc")
-
-chunk_embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en") \
-    .setInputCols("ner_chunk_doc") \
-    .setOutputCol("sentence_embeddings")
-
-use_er_model = finance.SentenceEntityResolverModel.pretrained("finel_edgar_company_name", "en", "finance/models") \
-    .setInputCols(["ner_chunk_doc", "sentence_embeddings"]) \
-    .setOutputCol("normalized")\
-    .setDistanceFunction("EUCLIDEAN")
-##########################################################################
-
-cm = finance.ChunkMapperModel()\
-    .pretrained("finmapper_edgar_companyname", "en", "finance/models")\
-    .setInputCols(["normalized"])\ # or ner_chunk for non normalized versions
-    .setOutputCol("mappings")
+    .setInputCols(["document","token","ner"])\
+    .setOutputCol("ner_chunk")
+
+cm = finance.ChunkMapperModel().pretrained("finmapper_edgar_companyname", "en", "finance/models")\
+    .setInputCols(["ner_chunk"])\
+    .setOutputCol("mappings")\
+    .setEnableFuzzyMatching(True)
 
 nlpPipeline = nlp.Pipeline(stages=[
-    documentAssembler,
-    tokenizer,
-    embeddings,
-    ner_model,
-    ner_converter,
-    chunkToDoc,
-    chunk_embeddings,
-    use_er_model,
-    cm
+    document_assembler,
+    tokenizer,
+    embeddings,
+    ner_model,
+    ner_converter,
+    cm
 ])
 
-text = """NIKE Inc is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear,
-apparel, equipment, accessories, and services"""
+text = """NIKE Inc is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services"""
 
 test_data = spark.createDataFrame([[text]]).toDF("text")
 
 model = nlpPipeline.fit(test_data)
 
 lp = nlp.LightPipeline(model)
 
-lp.annotate(text)
+result = lp.fullAnnotate(text)
 ```
 
 </div>
 
 ## Results
 
 ```bash
-{"mappings": [["labeled_dependency", 0, 22, "Jamestown Invest 1, LLC", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "name", "all_relations": ""}], ["labeled_dependency", 0, 22, "REAL ESTATE INVESTMENT TRUSTS [6798]", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "sic", "all_relations": ""}], ["labeled_dependency", 0, 22, "6798", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "sic_code", "all_relations": ""}], ["labeled_dependency", 0, 22, "831529368", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "irs_number", "all_relations": ""}], ["labeled_dependency", 0, 22, "1231", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "fiscal_year_end", "all_relations": ""}], ["labeled_dependency", 0, 22, "GA", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "state_location", "all_relations": ""}], ["labeled_dependency", 0, 22, "DE", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "state_incorporation", "all_relations": ""}], ["labeled_dependency", 0, 22, "PONCE CITY MARKET", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_street", "all_relations": ""}], ["labeled_dependency", 0, 22, "ATLANTA", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_city", "all_relations": ""}], ["labeled_dependency", 0, 22, "GA", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_state", "all_relations": ""}], ["labeled_dependency", 0, 22, "30308", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_zip", "all_relations": ""}], ["labeled_dependency", 0, 22, "7708051000", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_phone", "all_relations": ""}], ["labeled_dependency", 0, 22, "Jamestown Atlanta Invest 1, LLC", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "former_name", "all_relations": ""}], ["labeled_dependency", 0, 22, "20180824", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "former_name_date", "all_relations": ""}], ["labeled_dependency", 0, 22, "2019-11-21", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "date", "all_relations": "2019-10-24:::2019-11-25:::2019-11-12:::2022-01-13:::2022-03-31:::2022-04-11:::2022-07-12:::2022-06-30:::2021-01-14:::2021-04-06:::2021-03-31:::2021-04-28:::2021-06-30:::2021-09-10:::2021-09-22:::2021-09-30:::2021-10-08:::2020-03-16:::2021-12-30:::2020-04-06:::2020-04-29:::2020-06-12:::2020-07-20:::2020-07-07:::2020-07-28:::2020-07-31:::2020-09-09:::2020-09-25:::2020-10-08:::2020-11-12"}], ["labeled_dependency", 0, 22, "1751158", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "company_id", "all_relations": ""}]]}
+{
+    "name": "NIKE, Inc.",
+    "sic": "RUBBER & PLASTICS FOOTWEAR [3021]",
+    "sic_code": "3021",
+    "irs_number": "930584541",
+    "fiscal_year_end": "531",
+    "state_location": "OR",
+    "state_incorporation": "OR",
+    "business_street": "ONE BOWERMAN DR",
+    "business_city": "BEAVERTON",
+    "business_state": "OR",
+    "business_zip": "97005-6453",
+    "business_phone": "5036713173",
+    "former_name": "NIKE INC",
+    "former_name_date": "19920703",
+    "date": "2022-01-06",
+    "company_id": "320187"
+}
 ```
 
 {:.model-param}
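
A hedged sketch of reading the match back out of the `result` object returned by `lp.fullAnnotate(text)` above: the organization chunk detected in the text can be compared with the EDGAR registrant name it was mapped to. It assumes Spark NLP's standard `fullAnnotate` output, a list of dictionaries keyed by output column whose values are annotation objects with `.result` and `.metadata`.

```python
# Illustrative sketch only; `result` comes from the model card snippet above.
chunk = result[0]["ner_chunk"][0]
mapped = {ann.metadata["relation"]: ann.result
          for ann in result[0]["mappings"]
          if "relation" in ann.metadata}

print(chunk.result)        # organization chunk as detected in the text, e.g. 'NIKE Inc'
print(mapped.get("name"))  # registrant name as recorded in EDGAR, e.g. 'NIKE, Inc.'
print(mapped.get("sic"), mapped.get("state_incorporation"))
```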
