Commit 510cc29
Implemented data collection of scraped data. (#481)
* Added listen on all interfaces to bolt configuration for neo4j in the analysis ms
* Allowed more auth failure attempts
* Allowed more auth failure attempts
* Allowed more auth failure attempts
* Added default listen address and advertised address
* Disabling auth so that password can be reset
* Disabling auth so that password can be reset
* Adding auth back in to trigger password reset.
* Testing usage of neo4j
* Testing usage of neo4j
* Changed tweet fetch interval to each minute, implemented deletion of cronjobs by default.
* Setting collection pod count to zero to trigger cronjob destruction
* Reinstated 3 collection ms replicas
* Updated lerna-lite version
* Updated lerna-lite version
* Added environment variables to analysis ms template, updated values.yaml files and updated circle config.
* Added environment variables to analysis ms template, updated values.yaml files and updated circle config.
* Added environment variables to analysis ms template, updated values.yaml files and updated circle config.
* Added environment variables to analysis ms template, updated values.yaml files and updated circle config.
* Added quotes around config values
* Updated values to override
* Updated values to override
* Updated values to override
* Removed quotes from config fields
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Testing access to complex value
* Updated apollo-datasource-neo4j version
* Started using neo4j ssc
* Started using neo4j defaults
* Added map reduce to compute sentiment.
* Added map reduce to compute sentiment.
* Updated tweet addition commands to merge with nodes already present. Testing to see if unique trees are formed in db.
* Updated memory and cpu usage for neo4j
* Updated query
* Added more memory to neo4j
* Reverted memory configuration as a result of DO limits
* Removed recreate pods.
* Bumped version of apollo-datasource-neo4j
* Bumped version of apollo-datasource-neo4j
* Updated queries to include named relationships
* Updated logging statement in analysis service
* Updated moment to use milliseconds, not seconds
* Added memory to resources of analysis neo4j
* Added some additional jvm arguments.
* Updated heap size for analysis neo4j
* Set heap sizes manually for analysis neo4j
* Made heap memory 3gb on analysis neo4j
* Testing something
* Reduced memory size to that in documentation. Upgraded size of volume connected for data storage.
* Changed cron job schedule to every 6 hours
* Updated timeline insertion query
* Modified neo4j queries
* Began addition of expected data model to front-end.
* Continued addition of expected data model to front-end.
* Continued implementing handling of new data model.
* Enabled http through defaults.
* Added TODO to data collector
* Modified front end to use specific lib in moment, modified analysis ms to add sentiment to neo4j
* Modified front end to use specific lib in moment, modified analysis ms to add sentiment to neo4j
* Updated stuff
* Updated subscription resolvers
* Updated analysis service to use sentiment score, not object
* Updated analysis service to use sentiment comparative score, not object
* Updated sentiment check
* Updated tweet check in neo4j data source
* Added logging statement to sentiment addition in analysis service
* Made start data collection async so execution of commands could be awaited. Updated necessary functions in collection service.
* Bumped version of KafkaJS used.
* Testing whether waiting causes correct reception of kafka messages.
* Testing whether waiting causes correct reception of kafka messages.
* Trying something
* Added logging statements to inspect issue with addition of sentiment node
* Added deployment strategy to test result.
* Testing use of maxUnavailable and maxSurge for rolling updates.
* Testing use of maxUnavailable and maxSurge for rolling updates.
* Added minReadySeconds to deployment for testing
* More deployment tests
* Moved processing of economic entity memo in collection ms to immediate function
* Modified data collector to see if that's causing the issue
* Modified data collector to see if that's causing the issue
* Updated bitnami
* Eliminated fromBeginning for testing
* Testing
* Started using addSentiments instead of addSentiment. Made addSentiment private due to requirement of creating dates, economicEntity first.
* Modified collection ms to only emit most recent tweet recording, modified edge and coll + analysis ms to use utcDateTime string instead of unix timestamp, modified neo4j operations to use supplied datetime(...) object. Standardizing on UTC date time strings.
* Instead of storing string in mongodb using date object.
* Adding recommended new Date(...). Seems strange but testing the result.
* Added logging statement to collection ms for failure to create tweets in datastore.
* Added logging statement to collection ms for failure to create tweets in datastore.
* Modified reduction operation to use utcDateTime in collection ms
* Removed tweet extraction from analysis ms compute sentiment.
* Used tweet text instead of tweet object in analysis service
* Used text in sentiment calculation instead of tweet object in analysis service
* Using 128 Gi instead of 64 after hanging of neo4j UI
* Got rid of 'not' sign in Number.isNaN(...) for neo4j store in analysis ms.
* Added logging statement to read function
* Mapped neo4j results to sentiment object.
* Updated front-end and back-end. Testing results.
* Added fetch sentiments in neo4j store, mapped recent and arbitrary sentiment results to that expected by the client
* Switched Number out with Float in analysis ms graphql schema
* Updated storage to be 128 gigs for analysis neo4j
* Added utcDateTime to rows returned from readSentiment query for compatibility with reducer, eliminated seemingly unnecessary database operations after modification of apollo-datasource-neo4j
* Modified readSentiment query to include utcDateTime instead of dateTime.
* Modified readMostRecentSentiment query in neo4j store of analysis ms
* Changed neo4j store query executed based on neo4j docs
* Added logging statement for troubleshooting
* Made modifications based on query results.
* Made modifications to query.
* Updated neo4j store query in analysis ms
* Updated neo4j store query in analysis ms
* Enabled apoc procedures.
* Modified query
* Modified query
* Modified query
* Modified graphql scalar date to accept strings instead of ints
* Testing analysis ms schema using double array in graphql
* Renamed resolver in analysis ms to match schema
* Modified query
* Modified query
* Removed null default from neo4j store reduction of database data in analysis ms.
* Ordered sentiment match ascending, modified front-end sentiment graph and summary to operate with new data. Still have to do subscription.
* Ordered sentiment match ascending, modified front-end sentiment graph and summary to operate with new data. Still have to do subscription.
* Modified get sentiment query for neo4j store of analysis ms
* Modified get sentiment query for neo4j store of analysis ms
* Made some modifications to front-end, eliminated some TODOs
* Added formatting to start date for start and end date date pickers, fixed issue where null wasn't correctly processed.
* Modified collection cron to occur every minute.
* Got rid of extend in schema def for subscription ms, also added filter based on endDate being null
* Commented check on null date for testing purposes.
* Updated filter function of subscription ms, updated analysis ms to send economicEntity object instead of name and type separately.
* Modified subscription to return payload data and not entire payload.
* Added logging statement to trouble-shoot subscription ms.
* Modified neo4j store query to return standard UTC format, set most recent data in analysis service of analysis ms to return data from and not array
* Added logging statement to analysis ms for troubleshooting purposes.
* Eliminated overwhelming logging statement in analysis ms
* Added parentheses around recent sentiment fetch in analysis ms
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Testing neo4j aggregate function collect(...)
* Modified neo4j query to collect tweets and average sentiment causing group by utcDateTime, updated analysis ms
* Updated queries.
* Added filter to sentiment subscription to require a null end-date.
* Changed cronjob tweet fetch interval in collection ms back to every 6 hours.
* Eliminated some sub-todos in summary page code. Moved markup into functions to make render function more readable.
* Added time to front-end values sent back so that start and end date would be included.
* Added limit functionality to fetch sentiments
* Backed out limit changes
* Centralizing types for consistency and ease of maintenance.
* Added factory method for economic entity type, tied type into graphql schema definitions for change broadcasting, updated various components.
* Added factory method for economic entity type, tied type into graphql schema definitions for change broadcasting, updated various components.
* Added factory method for economic entity type, tied type into graphql schema definitions for change broadcasting, updated various components.
* Updated various functions to use economic entity object for shared definitions.
* Testing copy of type directory into docker image.
* Testing copy of type directory into docker image.
* Testing copy of type directory into docker image.
* Added creation of symbolic link to keep docker build context constraint during docker build.
* Added creation of symbolic link to keep docker build context constraint during docker build.
* Added creation of symbolic link to keep docker build context constraint during docker build.
* Attempting to run npm install in project and copy into image.
* Attempting to run npm install in project and copy into image.
* Attempting to run npm install in project and copy into image.
* Updated remaining circle ci config builds, updated remaining dockerfiles.
* Updated remaining circle ci config builds, updated remaining dockerfiles.
* Modified circle config ms build to only install one file, modified dockerfiles to copy the one file to the necessary directory.
* Testing use of relative path in file add
* Updated lock file, reverted circle ms build to use yarn install instead of add.
* Modified economic entity type validity check.
* Updated cert ids
* Changed neo4j block storage allocation for analysis ms to 64Gi
* Added toggling of dependency installs
* Made modifications due to readonly objects
* Updated some of the collection ms to operate on economic entity type.
* Modified configuration ms
* Updated analysis ms end date check to allow for nullability.
* Returned valid case for end date instead of when it's invalid.
* Updated tweet sentiment computed event to be sentiment computed, eliminated dead code, modified analysis ms to use economic entity objects.
* Updated tweet sentiment computed event to be sentiment computed, eliminated dead code, modified analysis ms to use economic entity objects.
* Updated much of the collection ms to use thinkdeep types
* Updated remaining portion of collection ms to use thinkdeep type
* Added logging statement to collection ms economic entity memo for testing purposes.
* Added filter to read all to see if error subsides.
* Modified collection ms to return null if no results found in _readMemo
* Added await to retrieve array
* Separated await on toArray from filter method
* Updated validity check in filter function to be correct.
* Updated config ms to use thinkdeep type
* Updated subscription ms to use thinkdeep type
* Modified type to be compatible with front-end as well.
* Created util package to separate utilities such as validity checking, renamed thinkdeep type to model
* Updated necessary dockerfiles to include copy of util package.
* Modified front-end graphql files to be js to incorporate thinkdeep model, added tests to data collector.
* Implemented a couple tests.
* Implemented a couple tests.
* Added some tests around economic analyzer
* Added more tests.
* Added more tests.
* Added more tests
* Updated validEconomicEntity in type
* Fixed broken fe tests.
* Made some changes
* Made some changes
* Updated dependencies.
* Added tests around _computeSentiment of analysis ms
* Stubbed out analysis ms neo4j store tests
* Added tests around analysis ms
* Reintroduced collection ms tests and fixed them.
* Added/modified more tests. Updated model dependencies to include those related to testing.
* Added more tests
* Updated tests to be recursive, fixed broken tests in config and coll ms, fixed some code smells
* Updated tests to be recursive, fixed broken tests in analysis ms
* Modified circleci configs to include caching of node modules download for packages, updated environment variables.
* Updated neo4j helm chart version in analysis ms
* Updated e2e tests
* Updated neo4j cert id
* Used correct moment import value for front-end
* Updated test finish timeout for web test runner
* Eliminated some code smells.
* Eliminated some code smells.
* Updated cron jobs to run minute to minute for subscription verification.
* Modified the subscription ms
* Modified subscription ms with logging statements to test failure of subscriptions on fe
* Modified subscription ms to create economic entity from payload, return payload.
* Modified subscription ms to create economic entity from payload, return payload.
* Backed out cron every minute.
* Added default advertised address to prod
* Added economic sector, sector type and factory to the model. Started using TDD for data-collector modifications.
* Made progress with site scraping. Added check for robots.txt to ensure sites allow it.
* Wrote tests around search engine interaction and robots.txt check.
* Made some naming changes
* Finished simple scraping of public allowed web pages. Added web site object to model.
* Switched to scraping branch
* Merging master into branch
* Extended timeout expectation
* Updated to newest neo4j helm chart for analysis ms
* Updated resource constraints
* Added changes to economic analyzer tests
* Updated prod config to include neo4j
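The scraping commits above reference a robots.txt check before any page is fetched. The data-scraper.js internals aren't shown in this commit, so the helper below is only a rough sketch of how that check might look using the robots-parser and axios dependencies this commit adds; the function name and user-agent string are illustrative assumptions.

import axios from 'axios';
import robotsParser from 'robots-parser';

// Illustrative helper: fetch a site's robots.txt and ask whether the given
// URL may be crawled. Treats a missing robots.txt as permission to scrape.
async function allowedByRobotsTxt(url, userAgent = 'predecos-data-collector') {
  const robotsUrl = new URL('/robots.txt', url).toString();
  try {
    const response = await axios.get(robotsUrl);
    const robots = robotsParser(robotsUrl, response.data);
    // isAllowed returns true, false, or undefined; only an explicit false blocks scraping.
    return robots.isAllowed(url, userAgent) !== false;
  } catch (e) {
    // No robots.txt (or it couldn't be fetched): treat the page as allowed.
    return true;
  }
}

// Example usage: only scrape the page if the site permits it.
if (await allowedByRobotsTxt('https://example.com/about')) {
  // ... fetch and scrape the page ...
}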
1 parent: dca2077 · commit: 510cc29

File tree: 57 files changed, +4698 −1758 lines


.circleci/config.yml

Lines changed: 4 additions & 4 deletions
@@ -417,9 +417,9 @@ jobs:
       namespace: "production"
       release-name: "v1"
       atomic: true
-      values-to-override: 'gateway.service.certificateId=$PROD_MICROSERVICE_GATEWAY_CERTIFICATE_ID,subscription.service.certificateId=$PROD_MICROSERVICE_SUBSCRIPTION_CERTIFICATE_ID,auth.audience=$PREDECOS_AUTH_AUDIENCE,auth.jwksUri=$PREDECOS_AUTH_JWKS_URI,auth.issuer=$PREDECOS_AUTH_ISSUER,analysis.neo4j.neo4j.username=$PROD_MICROSERVICE_ANALYSIS_NEO4J_USERNAME,analysis.neo4j.neo4j.password=$PROD_MICROSERVICE_ANALYSIS_NEO4J_PASSWORD,analysis.neo4j.config.dbms\.default_database=$PROD_MICROSERVICE_ANALYSIS_NEO4J_DATABASE,analysis.neo4j.config.dbms\.default_advertised_address=$PROD_MICROSERVICE_ANALYSIS_NEO4J_ADVERTISED_ADDRESS,collection.replicas=3,collection.twitter.bearer=$PROD_PREDECOS_TWITTER_BEARER,collection.data.collector.image=thinkdeeptech/data-collector:latest,replicas=1,global.docker.secretName=docker-secret,global.nodeEnv=production'
+      values-to-override: "gateway.service.certificateId=$PROD_MICROSERVICE_GATEWAY_CERTIFICATE_ID,subscription.service.certificateId=$PROD_MICROSERVICE_SUBSCRIPTION_CERTIFICATE_ID,auth.audience=$PREDECOS_AUTH_AUDIENCE,auth.jwksUri=$PREDECOS_AUTH_JWKS_URI,auth.issuer=$PREDECOS_AUTH_ISSUER,analysis.neo4j.neo4j.username=$PROD_MICROSERVICE_ANALYSIS_NEO4J_USERNAME,analysis.neo4j.neo4j.password=$PROD_MICROSERVICE_ANALYSIS_NEO4J_PASSWORD,analysis.neo4j.services.neo4j.enabled=true,analysis.neo4j.services.neo4j.annotations.service\\.beta\\.kubernetes\\.io/do-loadbalancer-certificate-id=$PROD_ANALYSIS_NEO4J_LOADBALANCER_CERTIFICATE_ID,analysis.neo4j.config.dbms\\.default_database=$PROD_MICROSERVICE_ANALYSIS_NEO4J_DATABASE,analysis.neo4j.config.dbms\\.default_advertised_address=$PROD_MICROSERVICE_ANALYSIS_NEO4J_ADVERTISED_ADDRESS,collection.replicas=3,collection.twitter.bearer=$PROD_PREDECOS_TWITTER_BEARER,collection.data.collector.image=thinkdeeptech/data-collector:latest,replicas=1,global.docker.secretName=docker-secret,global.nodeEnv=production"
       update-repositories: true
-      timeout: "480s"
+      timeout: "960s"
       wait: true

@@ -465,9 +465,9 @@ jobs:
       namespace: "development"
       release-name: "v1"
       atomic: true
-      values-to-override: 'gateway.service.certificateId=$DEV_MICROSERVICE_GATEWAY_CERTIFICATE_ID,subscription.service.certificateId=$DEV_MICROSERVICE_SUBSCRIPTION_CERTIFICATE_ID,auth.audience=$PREDECOS_TEST_AUTH_AUDIENCE,auth.jwksUri=$PREDECOS_TEST_AUTH_JWKS_URI,auth.issuer=$PREDECOS_TEST_AUTH_ISSUER,collection.twitter.bearer=$DEV_PREDECOS_TWITTER_BEARER,replicas=1,global.docker.secretName=docker-secret,analysis.container.image=thinkdeeptech/deep-microservice-analysis-dev:latest,analysis.neo4j.neo4j.username=$DEV_MICROSERVICE_ANALYSIS_NEO4J_USERNAME,analysis.neo4j.neo4j.password=$DEV_MICROSERVICE_ANALYSIS_NEO4J_PASSWORD,analysis.neo4j.services.neo4j.enabled=true,analysis.neo4j.config.dbms\.default_database=$DEV_MICROSERVICE_ANALYSIS_NEO4J_DATABASE,analysis.neo4j.config.dbms\.default_advertised_address=$DEV_MICROSERVICE_ANALYSIS_NEO4J_ADVERTISED_ADDRESS,collection.replicas=3,collection.container.image=thinkdeeptech/deep-microservice-collection-dev:latest,collection.data.collector.image=thinkdeeptech/data-collector-dev:latest,configuration.container.image=thinkdeeptech/deep-microservice-configuration-dev:latest,gateway.container.image=thinkdeeptech/deep-microservice-gateway-dev:latest,subscription.container.image=thinkdeeptech/deep-microservice-subscription-dev:latest,global.nodeEnv=development'
+      values-to-override: "gateway.service.certificateId=$DEV_MICROSERVICE_GATEWAY_CERTIFICATE_ID,subscription.service.certificateId=$DEV_MICROSERVICE_SUBSCRIPTION_CERTIFICATE_ID,auth.audience=$PREDECOS_TEST_AUTH_AUDIENCE,auth.jwksUri=$PREDECOS_TEST_AUTH_JWKS_URI,auth.issuer=$PREDECOS_TEST_AUTH_ISSUER,collection.twitter.bearer=$DEV_PREDECOS_TWITTER_BEARER,replicas=1,global.docker.secretName=docker-secret,analysis.container.image=thinkdeeptech/deep-microservice-analysis-dev:latest,analysis.neo4j.neo4j.username=$DEV_MICROSERVICE_ANALYSIS_NEO4J_USERNAME,analysis.neo4j.neo4j.password=$DEV_MICROSERVICE_ANALYSIS_NEO4J_PASSWORD,analysis.neo4j.services.neo4j.enabled=true,analysis.neo4j.services.neo4j.annotations.service\\.beta\\.kubernetes\\.io/do-loadbalancer-certificate-id=$DEV_ANALYSIS_NEO4J_LOADBALANCER_CERTIFICATE_ID,analysis.neo4j.config.dbms\\.default_database=$DEV_MICROSERVICE_ANALYSIS_NEO4J_DATABASE,analysis.neo4j.config.dbms\\.default_advertised_address=$DEV_MICROSERVICE_ANALYSIS_NEO4J_ADVERTISED_ADDRESS,collection.replicas=3,collection.container.image=thinkdeeptech/deep-microservice-collection-dev:latest,collection.data.collector.image=thinkdeeptech/data-collector-dev:latest,configuration.container.image=thinkdeeptech/deep-microservice-configuration-dev:latest,gateway.container.image=thinkdeeptech/deep-microservice-gateway-dev:latest,subscription.container.image=thinkdeeptech/deep-microservice-subscription-dev:latest,global.nodeEnv=development"
       update-repositories: true
-      timeout: "480s"
+      timeout: "960s"
       wait: true

  deploy_ui_and_publish_coverage:

package.json

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
    "setup": "yarn run clean && yarn install",
    "lint": "eslint --fix 'packages/**/*.js'",
    "prettier": "prettier --write 'packages/**/*.js'",
-    "clean": "lerna exec --parallel -- rm -rf ./node_modules && rm -rf ./node_modules && rm yarn.lock",
+    "clean": "lerna exec --parallel -- rm -rf ./node_modules && rm -rf ./node_modules && rm -f yarn.lock",
    "build": "lerna run build --parallel",
    "tests": "yarn run tests:unit && yarn run tests:integration && yarn run tests:e2e",
    "tests:unit": "lerna run tests:unit",

packages/data-collector/package.json

Lines changed: 4 additions & 0 deletions
@@ -11,6 +11,7 @@
  "devDependencies": {
    "c8": "^7.11.3",
    "chai": "^4.3.4",
+    "chai-as-promised": "^7.1.1",
    "concat-stream": "^2.0.0",
    "mocha": "^9.1.3",
    "sinon-chai": "^3.7.0"
@@ -19,10 +20,13 @@
    "@thinkdeep/attach-exit-handler": "^1.0.0",
    "@thinkdeep/model": "file:./../model",
    "@thinkdeep/util": "file:./../util",
+    "axios": "^0.27.2",
    "commander": "^9.0.0",
+    "duck-duck-scrape": "^2.2.1",
    "kafkajs": "^2.1.0",
    "log4js": "^6.3.0",
    "moment": "^2.29.4",
+    "robots-parser": "^3.0.0",
    "sinon": "^12.0.1",
    "twitter-api-v2": "^1.11.0"
  },
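The new axios, duck-duck-scrape and robots-parser dependencies correspond to the commit messages about search engine interaction and scraping of publicly allowed pages. As a hedged sketch (not the actual data-scraper implementation), duck-duck-scrape's search(...) might be used to find candidate pages for an economic entity before the robots.txt-gated fetch; the helper name and query shape below are assumptions.

import {search} from 'duck-duck-scrape';

// Illustrative only: ask DuckDuckGo for pages mentioning the economic entity
// and return their URLs for later, robots.txt-gated scraping.
async function findCandidatePages(economicEntity, limit = 10) {
  const searchResults = await search(`${economicEntity.name} ${economicEntity.type}`);
  return searchResults.results.slice(0, limit).map((result) => result.url);
}

// Example usage with a business entity.
const urls = await findCandidatePages({name: 'Google', type: 'BUSINESS'});
console.log(urls);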

packages/data-collector/src/data-collector.js

Lines changed: 89 additions & 75 deletions
@@ -1,7 +1,8 @@
-import {EconomicEntityFactory, EconomicEntityType} from '@thinkdeep/model';
+import {EconomicEntityFactory, CollectionOperationType} from '@thinkdeep/model';
import {validString} from '@thinkdeep/util';
import {Client} from './client.js';
import {Command, Option} from 'commander';
+import {DataScraper} from './data-scraper.js';
import {Kafka} from 'kafkajs';
import log4js from 'log4js';
import moment from 'moment';
@@ -20,121 +21,103 @@ try {

  program.addOption(
    new Option(
-      '-n, --entity-name <entity name>',
-      'Specify the name of the economic entity for which the operation will be performed.'
+      '-e, --economic-entity <economic entity>',
+      `Specify the economic entity (i.e, '{ "name": "Google", "type": "BUSINESS"}').`
    )
  );

  program.addOption(
    new Option(
-      '-t, --entity-type <entity type>',
-      'Specify the type of the economic entity for which the operation will be performed.'
-    ).choices(['BUSINESS'])
+      '-o, --operation-type <operation type>',
+      'Specify the type of data collection operation you would like to execute.'
+    ).choices(CollectionOperationType.types)
  );

  program.addOption(
    new Option(
-      '-o, --operation-type <operation type>',
-      'Specify the type of data collection operation you would like to execute.'
-    ).choices(['fetch-tweets'])
+      '-l, --limit [limit]',
+      'Specify the limit associated with the operation.'
+    ).default(10, 'Defaults to 10.')
  );

  program.addOption(
    new Option(
-      '-m, --num-tweets [num tweets]',
-      'Specify the number of tweets to be fetched at once.'
-    ).default(10, 'The default number to fetch.')
+      '-m, --mock-data <mock data>',
+      'Trigger mocking of the cli.'
+    ).default({}, 'An empty object')
  );

-  program.addOption(new Option('--mock-run', 'Trigger mocking of the cli.'));
-
  program.parse(process.argv);

  const options = program.opts();

+  const economicEntity = EconomicEntityFactory.get(
+    JSON.parse(options.economicEntity)
+  );
+
  if (!validString(options.operationType))
    throw new Error('Operation type is required');

-  if (!validString(options.entityName))
-    throw new Error(`Entity name is required`);
-
-  if (!validString(options.entityType))
-    throw new Error(`Entity type is required`);
-
-  if (!EconomicEntityType.valid(options.entityType))
-    throw new Error(`Entity type ${options.entityType} is invalid.`);
+  const currentUtcDateTime = moment().utc().format();

-  const economicEntity = EconomicEntityFactory.economicEntity({
-    name: options.entityName,
-    type: options.entityType,
-  });
+  let kafkaClient;
+  let twitterClient;
+  if (!options.mockData || Object.keys(options.mockData).length <= 0) {
+    logger.info(`Creating kafka client.`);
+    kafkaClient = new Kafka({
+      clientId: 'collect-data',
+      brokers: [
+        `${process.env.PREDECOS_KAFKA_HOST}:${process.env.PREDECOS_KAFKA_PORT}`,
+      ],
+    });
+
+    twitterClient = new TwitterApi(process.env.PREDECOS_TWITTER_BEARER)
+      .readOnly;
+  } else {
+    logger.info(`Creating mock kafka client.`);
+    kafkaClient = {
+      admin: sinon.stub().returns({
+        connect: sinon.stub(),
+        createTopics: sinon.stub(),
+        disconnect: sinon.stub(),
+      }),
+      producer: sinon.stub().returns({
+        connect: sinon.stub(),
+        send: sinon.stub(),
+        disconnect: sinon.stub(),
+      }),
+    };
+
+    twitterClient = {
+      v2: {
+        get: sinon.stub().returns({
+          data: JSON.parse(options.mockData),
+        }),
+      },
+    };
+  }

-  const currentUtcDateTime = moment().utc().format();
+  const collectDataClient = new Client(twitterClient, kafkaClient, logger);

  switch (options.operationType) {
-    case 'fetch-tweets': {
+    case CollectionOperationType.FetchTweets: {
      logger.info('Fetching tweets.');
-      let twitterClient;
-      let kafkaClient;
-      if (!options.mockRun) {
-        twitterClient = new TwitterApi(process.env.PREDECOS_TWITTER_BEARER)
-          .readOnly;
-
-        kafkaClient = new Kafka({
-          clientId: 'collect-data',
-          brokers: [
-            `${process.env.PREDECOS_KAFKA_HOST}:${process.env.PREDECOS_KAFKA_PORT}`,
-          ],
-        });
-      } else {
-        twitterClient = {
-          v2: {
-            get: sinon.stub().returns({
-              data: [
-                {
-                  text: 'tweet 1',
-                },
-                {
-                  text: 'tweet 2',
-                },
-                {
-                  text: 'tweet 3',
-                },
-              ],
-            }),
-          },
-        };
-        kafkaClient = {
-          admin: sinon.stub().returns({
-            connect: sinon.stub(),
-            createTopics: sinon.stub(),
-            disconnect: sinon.stub(),
-          }),
-          producer: sinon.stub().returns({
-            connect: sinon.stub(),
-            send: sinon.stub(),
-            disconnect: sinon.stub(),
-          }),
-        };
-      }
-
-      const collectDataClient = new Client(twitterClient, kafkaClient, logger);

      (async () => {
        logger.info('Connecting to data collection client.');
        await collectDataClient.connect();

        const recentTweets = await collectDataClient.fetchRecentTweets({
          query: `${options.entityName} lang:en -is:retweet`,
-          max_results: options.numTweets,
+          max_results: options.limit,
        });
        logger.debug(
          `Retrieved the following tweets: ${JSON.stringify(recentTweets)}`
        );

        const data = {
          utcDateTime: currentUtcDateTime,
-          economicEntity: economicEntity.toObject(),
+          economicEntity,
          tweets: recentTweets,
        };

@@ -145,6 +128,37 @@ try {

      break;
    }
+    case CollectionOperationType.ScrapeData: {
+      (async () => {
+        logger.info('Connecting to data collection client.');
+        await collectDataClient.connect();
+
+        logger.info(
+          `Scraping data for ${economicEntity.type} ${economicEntity.name}.`
+        );
+        const scraper =
+          !options.mockData || Object.keys(options.mockData).length <= 0
+            ? new DataScraper(logger)
+            : sinon.createStubInstance(DataScraper);
+
+        const scrapedData = await scraper.scrapeData(economicEntity);
+
+        const data = {
+          utcDateTime: currentUtcDateTime,
+          economicEntity,
+          data: scrapedData,
+        };
+
+        await collectDataClient.emitEvent('DATA_SCRAPED', data);
+      })();
+
+      break;
+    }
+    default: {
+      throw new Error(
+        `The specified operation ${options.operationType} isn't yet supported.`
+      );
+    }
  }
} catch (e) {
  logger.error(e.message.toString());
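With the reworked options above, the collector now receives the whole economic entity as a JSON string and an operation type drawn from CollectionOperationType. Below is a hedged sketch of how the collection microservice's cron command might invoke the CLI; the relative script path and the assumption that the operation types are plain strings are illustrative and not taken from this diff.

import {execFile} from 'node:child_process';
import {promisify} from 'node:util';
import {CollectionOperationType} from '@thinkdeep/model';

const execFileAsync = promisify(execFile);

// The economic entity is passed as JSON and parsed by
// EconomicEntityFactory.get(...) inside the CLI.
const economicEntity = JSON.stringify({name: 'Google', type: 'BUSINESS'});

// Omitting --mock-data keeps the default {}, so the real Kafka and Twitter
// clients are constructed rather than the sinon stubs.
const {stdout} = await execFileAsync('node', [
  'src/data-collector.js',
  '--economic-entity',
  economicEntity,
  '--operation-type',
  CollectionOperationType.ScrapeData,
]);

console.log(stdout);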

0 commit comments