Adding output format param to save_annotated and adding new method save_text_extraction.

rp-iago1460 · rp-iago1460 · commit 4abd1ded6bb0 · 2021-11-04T17:17:42.000+01:00
Updated Readme file.
diff --git a/README.md b/README.md
@@ -163,17 +163,49 @@ for ticker in references.tickers:
         print(ticker)
 ```
 
-### Text Analytics
+## Text Analytics
 
 Analyse your own content using RavenPack’s proprietary NLP technology.
 
-The API for analyzing your internal content is still in beta and may change in the future. You can request an early
-access and [see an example of usage here](ravenpackapi/examples/text_extraction.py).
+The API for analyzing your internal content is still in beta and may change in the future. You can request early access and [see an example of usage here](ravenpackapi/examples/text_analytics_example.py).
+
+### Uploading a file
+Upload a file to the system. To have your files analyzed by RavenPack's text analytics platform, upload each one with the following method:
+
+```python
+f = api.upload.file("_orig.doc")
+```
+
+Different options and features are available when uploading a file for development. For more information, please check the user guide found on RavenPack's platform.
+
+### Getting analytics
+Saves analytics for the processed files. You can choose to retrieve analytics in JSON-Lines or CSV format:
+
+```python
+f.save_analytics("_analytics.json")
+```
+
+### Getting normalized documents
+RavenPack’s Text Analytics provides normalized content in JSON format, along with text categorization, tables in HTML format and metadata derived from the original document.
+
+```python
+f.save_text_extraction("_text_extraction.json")
+```
+
+It is also possible to obtain the normalized content in JSON format, along with annotations of entities, events and analytics derived from the content.
+
+```python
+f.save_annotated("_annotated_document.json", output_format='application/json')
+```
+
+For further details, please [see the usage example here](ravenpackapi/examples/text_analytics_example.py).
+
+
+
 
 ### Accessing the low-level requests
 
-RavenPack API wrapper is using the [requests library](https://2.python-requests.org) to do HTTPS requests, you can set
-common requests parameters to all the outbound calls by setting the `common_request_params` attribute.
+The RavenPack API wrapper uses the [requests library](https://2.python-requests.org) to make HTTPS requests. You can apply common request parameters to all outbound calls by setting the `common_request_params` attribute.
 
 For example, to disable HTTPS certificate verification and to setup your internal proxy:
 
@@ -189,4 +221,4 @@ api.common_request_params.update(
 # use the api to do requests
 ```
 
-PS. For setting your internal proxies, requests will honor the HTTPS_PROXY environment variable.
+PS. For setting your internal proxies, requests will honor the HTTPS_PROXY environment variable.
diff --git a/ravenpackapi/examples/text_analytics_example.py b/ravenpackapi/examples/text_analytics_example.py
@@ -8,10 +8,15 @@
     print(f)
 
 # upload a file to access the analytics
-f = api.upload.file("_orig.doc",
-                    # upload_mode="RPXML"
-                    # properties={"primary_entity": "RavenPack"}
-                    )
+f = api.upload.file("_orig.doc")
+# f = api.upload.file("_orig.doc",
+                    # upload_mode="RPJSON",
+                    # properties={
+                    #   "primary_entity": "RavenPack",
+                    #   "provider_document_id": "<YOUR_DOCUMENT_ID>",
+                    #   "extractor": "PDF_TABLE_EXTRACTOR"
+                    #   }
+                    # )
 
 # you can also upload from a publicly available URL
 # f = api.upload.file("demo.html",
@@ -24,19 +29,26 @@
 # f = api.upload.get('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
 
 # get back the analytics found in the document
-f.save_analytics("_analytics.json")
+# f.save_analytics("_analytics.csv", output_format='text/csv')
+f.save_analytics("_analytics.json", output_format='application/json')
 
 # the annotated version
-f.save_annotated("us30orig.xml")
+# f.save_annotated("_annotated_document.xml", output_format='application/xml')
+f.save_annotated("_annotated_document.json", output_format='application/json')
 
 # or the original
 f.save_original("_orig.doc")
 
-# show the extracted text
+# show or save the extracted text
 # extracted_text = f.text_extraction()
+f.save_text_extraction("_text_extraction.json", output_format='application/json')
 
 # given a file we can set tags
 # f.set_metadata(tags=['file tag'])
+# f.get_metadata()
+
+# return the process status of the file
+f.get_status()
 
 # ... or delete it
 # f.delete()
diff --git a/ravenpackapi/upload/models.py b/ravenpackapi/upload/models.py
@@ -102,11 +102,15 @@ def get_analytics(self, output_format='application/json'):
             return response.text
 
     @api_method
-    def save_annotated(self, filename):
+    def save_annotated(self, filename, output_format='application/xml'):
         self.wait_for_completion()
         response = retry_on_too_early(self.api.request,
                                       '%s/files/%s/annotated' % (self.api._UPLOAD_BASE_URL, self.file_id),
-                                      stream=True)
+                                      stream=True,
+                                      headers=dict(
+                                          Accept=output_format,
+                                          **self.api.headers
+                                      ))
         with open(filename, 'wb') as f:
             for chunk in response.iter_content(chunk_size=self.api._CHUNK_SIZE):
                 f.write(chunk)
@@ -165,6 +169,18 @@ def text_extraction(self, output_format="text/csv"):
                                     )
         return response.text
 
+    @api_method
+    def save_text_extraction(self, filename, output_format='application/json'):  # write the text-extraction output to `filename`
+        headers = self.api.headers.copy()  # work on a copy so the API object's own headers are not modified
+        headers["Accept"] = output_format  # the response format is requested via Accept, as save_annotated does
+        response = retry_on_too_early(self.api.request,
+                                      '%s/files/%s/text-extraction' % (self.api._UPLOAD_BASE_URL, self.file_id),
+                                      stream=True,
+                                      headers=headers)
+        with open(filename, 'wb') as f:  # binary mode: chunks are written exactly as received
+            for chunk in response.iter_content(chunk_size=self.api._CHUNK_SIZE):
+                f.write(chunk)
+
 
 class Folder(object):
     """ A Folder containing files """