diff --git a/diffbot/diffbot_import.ipynb b/diffbot/diffbot_import.ipynb index 643b81e..9218f0b 100644 --- a/diffbot/diffbot_import.ipynb +++ b/diffbot/diffbot_import.ipynb @@ -4,7 +4,7 @@ "metadata": { "colab": { "provenance": [], - "authorship_tag": "ABX9TyPy5Yo6ryKsOk4HajcMMpTe", + "authorship_tag": "ABX9TyNr0ll91B76jYwbS8sA0Egd", "include_colab_link": true }, "kernelspec": { @@ -36,28 +36,28 @@ "base_uri": "https://localhost:8080/" }, "id": "RQ5c2LRxUknp", - "outputId": "e8887f79-7cde-4044-a879-5351a54c98ed" + "outputId": "3cc2a4ee-e3dd-4ac8-87fd-d443e1b58974" }, - "execution_count": 79, + "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting neo4j\n", - " Downloading neo4j-5.23.1-py3-none-any.whl.metadata (5.7 kB)\n", + " Downloading neo4j-5.24.0-py3-none-any.whl.metadata (5.7 kB)\n", "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from neo4j) (2024.1)\n", - "Downloading neo4j-5.23.1-py3-none-any.whl (293 kB)\n", - "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/293.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m293.6/293.6 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "Downloading neo4j-5.24.0-py3-none-any.whl (294 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.6/294.6 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: neo4j\n", - "Successfully installed neo4j-5.23.1\n" + "Successfully installed neo4j-5.24.0\n" ] } ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 40, "metadata": { "id": "9cR3kuPRCmXU" }, @@ -78,19 +78,19 @@ "base_uri": "https://localhost:8080/" }, "id": "hraRRIc_Cvt6", - "outputId": "4af1e9da-233a-43b6-c7cc-2417467c2e6b" + "outputId": 
"57cf1480-1203-4812-ca0f-0f18c74c5a11" }, - "execution_count": 7, + "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "dict_keys(['twitterUri', 'nbActiveEmployeeEdges', 'suppliers', 'type', 'naceClassification', 'iSicClassification', 'allNames', 'isAcquired', 'revenue', 'competitors', 'emailAddresses', 'yearlyRevenues', 'logo', 'id', 'stock', 'nbOrigins', 'sicClassification', 'foundingDate', 'image', 'images', 'wikipediaUri', 'secForms', 'irsEmployerIdentificationNumbers', 'naicsClassification2017', 'diffbotUri', 'nbIncomingEdges', 'nbEmployeesMin', 'ipo', 'employeeCategories', 'wikipediaPageviewsLastQuarterGrowth', 'capitalization', 'wikipediaPageviewsLastYear', 'angellistUri', 'name', 'legalEntities', 'nbEmployeesMax', 'totalInvestment', 'allOriginHashes', 'linkedInUri', 'naicsClassification', 'nbEmployees', 'monthlyTraffic', 'githubUri', 'isDissolved', 'companiesHouseIds', 'importance', 'origin', 'description', 'ultimateParent', 'homepageUri', 'founders', 'ceo', 'investments', 'phoneNumbers', 'diffbotClassification', 'blogUri', 'descriptors', 'wikipediaPageviewsLastYearGrowth', 'partnerships', 'isNonProfit', 'japaneseCorporateNumbers', 'origins', 'isPublic', 'wikipediaPageviews', 'categories', 'customers', 'crawlTimestamp', 'nbUniqueInvestors', 'facebookUri', 'secCentralIndexKeys', 'summary', 'types', 'technographics', 'boardMembers', 'wikipediaPageviewsLastQuarter', 'fullName', 'allUris', 'remoteScore', 'monthlyTrafficGrowth', 'quarterlyRevenues', 'nbLocations', 'crunchbaseUri', 'googlePlusUri', 'industries', 'allDescriptions', 'location', 'locations', 'subsidiaries', 'wikipediaPageviewsGrowth', 'vatIdentificationNumbers'])" + "dict_keys(['twitterUri', 'nbActiveEmployeeEdges', 'suppliers', 'type', 'naceClassification', 'iSicClassification', 'allNames', 'isAcquired', 'revenue', 'competitors', 'emailAddresses', 'yearlyRevenues', 'logo', 'id', 'stock', 'nbOrigins', 'sicClassification', 'foundingDate', 'image', 
'images', 'wikipediaUri', 'secForms', 'irsEmployerIdentificationNumbers', 'naicsClassification2017', 'diffbotUri', 'nbIncomingEdges', 'nbEmployeesMin', 'ipo', 'employeeCategories', 'wikipediaPageviewsLastQuarterGrowth', 'capitalization', 'wikipediaPageviewsLastYear', 'angellistUri', 'name', 'legalEntities', 'nbEmployeesMax', 'totalInvestment', 'allOriginHashes', 'linkedInUri', 'naicsClassification', 'nbEmployees', 'monthlyTraffic', 'githubUri', 'isDissolved', 'companiesHouseIds', 'importance', 'origin', 'description', 'homepageUri', 'founders', 'ceo', 'investments', 'phoneNumbers', 'diffbotClassification', 'blogUri', 'descriptors', 'wikipediaPageviewsLastYearGrowth', 'partnerships', 'isNonProfit', 'japaneseCorporateNumbers', 'origins', 'isPublic', 'wikipediaPageviews', 'categories', 'customers', 'crawlTimestamp', 'nbUniqueInvestors', 'facebookUri', 'secCentralIndexKeys', 'summary', 'types', 'technographics', 'boardMembers', 'wikipediaPageviewsLastQuarter', 'fullName', 'allUris', 'remoteScore', 'monthlyTrafficGrowth', 'quarterlyRevenues', 'nbLocations', 'crunchbaseUri', 'googlePlusUri', 'industries', 'allDescriptions', 'location', 'locations', 'subsidiaries', 'wikipediaPageviewsGrowth', 'vatIdentificationNumbers'])" ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 36 } ] }, @@ -107,7 +107,7 @@ "id": "9gcxoW44ELwY", "outputId": "15fa9544-9c8e-4653-d59c-a99873fba104" }, - "execution_count": 8, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -210,122 +210,79 @@ { "cell_type": "code", "source": [ - "data['data'][0]['entity']['boardMembers']" + "data['data'][0]['entity']['yearlyRevenues']" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qAWzw6nBFUlY", - "outputId": "46b252bc-5743-4df3-a34d-0c805c924afe" + "outputId": "9f060429-fa9d-4e44-d0c7-cf43da55db12" }, - "execution_count": 97, + "execution_count": 90, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "[{'summary': 'Chairman 
at Malwarebytes',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0SttAc%3ANu.d9U%7CB%5DpM08.%7DZnIa6%40_%3Bv2FpDLhawxEr-%5BuTpJZ%40-r%3Ec2Jh9e%3EN7%5Bq8h%3EQgCkx%3AT%3AEQ8W%7EN1%3BSgL%7E1Kw%3Ai.Vu4',\n", - " 'types': ['Person'],\n", - " 'name': 'Brooke Seawell',\n", - " 'diffbotUri': 'http://diffbot.com/entity/ELRfrbAB7Np21klMnLkKA9g',\n", - " 'targetDiffbotId': 'ELRfrbAB7Np21klMnLkKA9g',\n", - " 'type': 'Person'},\n", - " {'summary': 'Founder at Tensilica',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0SttAc%3ANu.d9U%7CB%5DpM08.%7DZnIa6%40_%3Bv2FpDLhawxEr-%5BuTpJZ%40-r%3Ec2Jh9e%3EN7%5Bn6l9PiCixvY98YUU5%3Bb7YkF%5EvL%7DB%60.Pw4',\n", - " 'types': ['Person', 'PersonInvestor'],\n", - " 'name': 'Harvey Jones',\n", - " 'diffbotUri': 'http://diffbot.com/entity/Eo_nbDUUnPv2TQtExGLeS9A',\n", - " 'targetDiffbotId': 'Eo_nbDUUnPv2TQtExGLeS9A',\n", - " 'type': 'Person'},\n", - " {'summary': 'American lawyer and chairman of multiple boards',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0St%7BFq9RnE-0MoFZ1.5%3CZ1.d9eHHe7%5ErI37m6EhMFFB%5BCtE9vUNeR%7DTg1%3C2AX7Ys%3Be-Mc5W%5E2XvE-%3CTf%3A.Oh2R%7DF%60.C_o',\n", - " 'types': ['Person'],\n", - " 'name': 'Stephen Neal',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EEZ4Qn6OSNki7_FIiUePz-w',\n", - " 'targetDiffbotId': 'EEZ4Qn6OSNki7_FIiUePz-w',\n", - " 'type': 'Person'},\n", - " {'summary': 'Board Member at NVIDIA',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0SttAc%3ANu.d9U%7CB%5DpM08.%7DZnIa6%40_%3Bv2FpDLhawxEr-%5BuTpJZ%40-r%3Ec2Jh9e%3EN7Zv%3De%3DOdHkxHsG%5DUSS6R51c%7E%3EaF%5En%3Fq%3AJ%3FRkFJ%5DIsx%3C.%3Aan',\n", - " 'types': ['Person'],\n", - " 'name': 'Mark Perry',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EM2NPkTaiPbmawxrZmxpoLw',\n", - " 'targetDiffbotId': 'EM2NPkTaiPbmawxrZmxpoLw',\n", - " 'type': 'Person'},\n", - " {'summary': 'Chairman at USC Stevens',\n", - " 'image': 
'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0Stn8p.%5DjEp9G.Bd%7BkpXn%7BP8R%7E-E%5BY%7D-Z%3C9L-j%3Czg-d%5CsQCXu0q7P%60zN_%3Cb4E.A%5Dm',\n", - " 'types': ['Person', 'PersonInvestor'],\n", - " 'name': 'Mark A. Stevens',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EI1WkamqPOR2rAFCPEabfdQ',\n", - " 'targetDiffbotId': 'EI1WkamqPOR2rAFCPEabfdQ',\n", - " 'type': 'Person'},\n", - " {'summary': 'Canadian businessman',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0St0DnBJf.x0KwLZrUn.%5B%3CR0Aa4Hh%3B%5Bv738ZqOr7U%3FEvtr%3Ew%5D%7ERu%40Y%5DVl%3F%5E2K-%3Cn%3Aj7NdCizEsA_.8_l',\n", - " 'types': ['Person', 'PersonInvestor'],\n", - " 'name': 'Rob Burgess',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EPOasdwHqMUCsxskAOAo3-A',\n", - " 'targetDiffbotId': 'EPOasdwHqMUCsxskAOAo3-A',\n", - " 'type': 'Person'},\n", - " {'summary': 'American businessperson',\n", - " 'types': ['Person'],\n", - " 'name': 'Dawn Hudson',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EOY5C5IprPeiOL5U2ZB1tOQ',\n", - " 'targetDiffbotId': 'EOY5C5IprPeiOL5U2ZB1tOQ',\n", - " 'type': 'Person'},\n", - " {'summary': 'American physicist',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0SttAc%3ANu.d9U%7CB%5DpM08.%7DZnIa6%40_%3Bv2FpDLhawxEr-%5BuTpJZ%40-r%3Ec2Jh9e%3EN7%5Br8h5LjHlx0cBQeAv%3E7q7Qd%603tI%2FEt.Pw4',\n", - " 'types': ['Person'],\n", - " 'name': 'Persis Drell',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EqT51RzkRPg-dtyvJOwl2KA',\n", - " 'targetDiffbotId': 'EqT51RzkRPg-dtyvJOwl2KA',\n", - " 'type': 'Person'},\n", - " {'summary': 'American businessperson',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0Stn8p.%5DjEp9G.Bd%7BkATr%2Fn7Sz-A%7D%5D%2F-Z%3AXL-DgOE-DXtoH7n0O9M%5EzN_%3Cb4E.A%5Dm',\n", - " 'types': ['Person'],\n", - " 'name': 'Melissa Lora',\n", - " 'diffbotUri': 'http://diffbot.com/entity/ETMFlTdjcNTiRq1JzJla3hA',\n", - " 'targetDiffbotId': 'ETMFlTdjcNTiRq1JzJla3hA',\n", - " 'type': 
'Person'},\n", - " {'summary': 'American venture capitalist',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0SttAc%3ANu.d9U%7CB%5DpM08.%7DZnIa6%40_%3Bv2FpDLhawxEr-%5BuTpJZ%40-r%3Ec2Jh9e%3EN7Zv%3De%3DShChxHSFYTWu53%2FPja_%60C%3C%3EYSjKl0LIk4MzKY.%3Aan',\n", - " 'types': ['Person'],\n", - " 'name': 'Tench Coxe',\n", - " 'diffbotUri': 'http://diffbot.com/entity/Eyd_MZXd8MaK3GJDxaNrqVA',\n", - " 'targetDiffbotId': 'Eyd_MZXd8MaK3GJDxaNrqVA',\n", - " 'type': 'Person'},\n", - " {'summary': 'Chairman at Makena Capital Management',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0SttAc%3ANu.d9U%7CB%5DpM08.%7DZnIa6%40_%3Bv2FpDLhawxEr-%5BuTpJZ%40-r%3Ec2Jh9e%3EO%7E8VhLs%7D%5C%7Cedy%5BrF%5DDbp',\n", - " 'types': ['Person'],\n", - " 'name': 'Michael McCaffery',\n", - " 'diffbotUri': 'http://diffbot.com/entity/EpaU5t6W4OiK0NeHvnP61Eg',\n", - " 'targetDiffbotId': 'EpaU5t6W4OiK0NeHvnP61Eg',\n", - " 'type': 'Person'},\n", - " {'summary': 'CIO at Northwestern Mutual',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7P0Stn8p.%5DjEp9G.Bd%7BkB6t%7Em2%2Fy-B%7DX%2F-ZhZM-j5%7EC-j%7DrRt9r%2FR81%7CzN_%3Cb4E.A%5Dm',\n", - " 'types': ['Person'],\n", - " 'name': 'Aarti Shah',\n", - " 'diffbotUri': 'http://diffbot.com/entity/E3DxhrvJRMuOeoyxxeeTU0A',\n", - " 'targetDiffbotId': 'E3DxhrvJRMuOeoyxxeeTU0A',\n", - " 'type': 'Person'},\n", - " {'summary': 'Taiwanese-American entrepreneur and businessman; founder and CEO of Nvidia',\n", - " 'image': 'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7guSXzAoBWu.x0KwLZrUn.%5B%3CR0Aa4Hwygr9m6W%3Exl5G%60BkxmY%3A%5C%3FRq%7BB%40Bh%3FXAX8Zn7d%3E.N2J',\n", - " 'types': ['Person'],\n", - " 'name': 'Jensen Huang',\n", - " 'diffbotUri': 'http://diffbot.com/entity/Erp3EYknCP8q0-FX6HGkdqw',\n", - " 'targetDiffbotId': 'Erp3EYknCP8q0-FX6HGkdqw',\n", - " 'type': 'Person'},\n", - " {'summary': 'Board Member from United States',\n", - " 'image': 
'https://kg.diffbot.com/image/api/get?fetch=yes&url=g%3Cj7guSXzAoBWu.x0KwLZrUn.%5B%3CR0Aa4Hwygr9m6W%3Exl5G%60BkxmYDV%3ALR.Y_B%5CAkx%2Cew0G7Yz1wP%5Bwbp%28S%40%5Eu%3Ea1%29.Dkx',\n", - " 'types': ['Person'],\n", - " 'name': 'John Dabiri',\n", - " 'diffbotUri': 'http://diffbot.com/entity/ECtOA5aKHPLmO8MqT8eeTmw',\n", - " 'targetDiffbotId': 'ECtOA5aKHPLmO8MqT8eeTmw',\n", - " 'type': 'Person'}]" + "[{'revenue': {'currency': 'USD', 'value': 60922000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2024},\n", + " {'revenue': {'currency': 'USD', 'value': 26974000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2023},\n", + " {'revenue': {'currency': 'USD', 'value': 26914000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2022},\n", + " {'revenue': {'currency': 'USD', 'value': 16675000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2021},\n", + " {'revenue': {'currency': 'USD', 'value': 10918000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2020},\n", + " {'revenue': {'currency': 'USD', 'value': 11716000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2019},\n", + " {'revenue': {'currency': 'USD', 'value': 9714000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2018},\n", + " {'revenue': {'currency': 'USD', 'value': 9714000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2017},\n", + " {'revenue': {'currency': 'USD', 'value': 6910000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2016},\n", + " {'revenue': {'currency': 'USD', 'value': 5010000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2015},\n", + " {'revenue': {'currency': 'USD', 'value': 4682000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2014},\n", + " {'revenue': {'currency': 'USD', 'value': 4130000000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2013},\n", + " {'revenue': {'currency': 'USD', 'value': 4280159000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2012},\n", + " {'revenue': {'currency': 'USD', 'value': 3997930000.0},\n", + " 'isCurrent': False,\n", + " 'year': 
2011},\n", + " {'revenue': {'currency': 'USD', 'value': 3543309000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2010},\n", + " {'revenue': {'currency': 'USD', 'value': 3326445000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2009},\n", + " {'revenue': {'currency': 'USD', 'value': 3424859000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2008},\n", + " {'revenue': {'currency': 'USD', 'value': 4097860000.0},\n", + " 'isCurrent': False,\n", + " 'year': 2007}]" ] }, "metadata": {}, - "execution_count": 97 + "execution_count": 90 } ] }, @@ -335,12 +292,13 @@ "used = [\"diffbotUri\", \"name\", \"summary\", \"wikipediaUri\", \"fullName\", \"twitterUri\", \"isAcquired\", \"linkedInUri\", \"githubUri\", \"importance\", \"blogUri\", \"homepageUri\",\n", " \"angellistUri\", \"logo\", \"monthlyTraffic\", \"isNonProfit\", \"nbEmployees\", \"crunchbaseUri\", \"googlePlusUri\", \"description\", \"suppliers\", \"foundingDate\", \"competitors\", \"stock\",\n", " \"isPublic\", \"wikipediaPageviews\", \"nbUniqueInvestors\", \"totalInvestment\", \"isDissolved\", \"facebookUri\", \"founders\", \"partnerships\", \"customers\", \"boardMembers\", \"subsidiaries\",\n", - " \"diffbotClassification\", \"companiesHouseIds\", \"secCentralIndexKeys\", \"ceo\", \"revenue\", \"ipo\", \"capitalization\"]\n", + " \"diffbotClassification\", \"companiesHouseIds\", \"secCentralIndexKeys\", \"ceo\", \"revenue\", \"ipo\", \"capitalization\", \"locations\", \"technographics\", \"foundingDateClean\", \"ipoDate\",\n", + " \"secForms\", \"investments\"]\n", "ignore = [\"origin\", \"type\", \"nbActiveEmployeeEdges\", \"id\", \"naceClassification\", \"nbOrigins\", \"iSicClassification\", \"allNames\", \"emailAddresses\", \"nbIncomingEdges\", \"nbEmployeesMin\",\n", " \"wikipediaPageviewsLastQuarterGrowth\", \"japaneseCorporateNumbers\", \"wikipediaPageviewsLastYear\", \"crawlTimestamp\", \"allUris\", \"remoteScore\", \"monthlyTrafficGrowth\", \"nbLocations\",\n", " \"wikipediaPageviewsGrowth\", 
\"vatIdentificationNumbers\", \"image\", \"images\", \"nbEmployeesMax\", \"sicClassification\", \"naicsClassification2017\", \"allOriginHashes\", \"phoneNumbers\",\n", " \"wikipediaPageviewsLastYearGrowth\", \"origins\", \"wikipediaPageviewsLastQuarter\", \"allDescriptions\", \"naicsClassification\", \"employeeCategories\", \"categories\", \"types\",\n", - " \"industries\", \"location\", \"irsEmployerIdentificationNumbers\", \"legalEntities\"]\n", + " \"industries\", \"location\", \"irsEmployerIdentificationNumbers\", \"legalEntities\", \"descriptors\"]\n", "available = [key for key in data['data'][0]['entity'].keys() if key not in used and key not in ignore]\n", "print(available)" ], @@ -349,15 +307,15 @@ "base_uri": "https://localhost:8080/" }, "id": "to5LtrRGE9yU", - "outputId": "1a4c68fb-324a-47d8-b5da-64f607a9bc37" + "outputId": "11b685f6-241b-46ef-e628-569ac2709720" }, - "execution_count": 77, + "execution_count": 88, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "['yearlyRevenues', 'secForms', 'ultimateParent', 'investments', 'descriptors', 'technographics', 'quarterlyRevenues', 'locations']\n" + "['yearlyRevenues', 'quarterlyRevenues']\n" ] } ] @@ -375,22 +333,65 @@ "metadata": { "id": "OCiiUmWrUrig" }, - "execution_count": 80, + "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ - "# constraints __Entity__, Classification\n", + "from typing import Optional, Union\n", + "from datetime import datetime\n", + "\n", + "def get_datetime(value: Optional[Union[str, int, float]]) -> datetime:\n", + " if not value:\n", + " return value\n", + " return datetime.fromtimestamp(float(value) / 1000.0)\n", "\n", - "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:__Entity__) REQUIRE n.id IS UNIQUE;\")\n", + "def preprocess_dates(data):\n", + " for entity in data['data']:\n", + " entity['entity']['foundingDateClean'] = get_datetime(entity['entity']['foundingDate']['timestamp'])\n", + " entity['entity']['ipoDate'] 
= get_datetime(entity['entity']['ipo']['date']['timestamp'])\n", + " for sec in entity['entity']['secForms']:\n", + " sec['dateClean'] = get_datetime(sec['filingDate']['timestamp'])\n", + " for inv in entity['entity']['investments']:\n", + " inv['dateClean'] = get_datetime(inv['date']['timestamp'])\n" + ], + "metadata": { + "id": "sEdler9AbJew" + }, + "execution_count": 80, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "preprocess_dates(data)" + ], + "metadata": { + "id": "zfaAFIiNdtd6" + }, + "execution_count": 81, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:Organization) REQUIRE n.id IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE n.id IS UNIQUE;\")\n", "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:Classification) REQUIRE n.id IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:Place) REQUIRE n.address IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:Country) REQUIRE n.id IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:Technographic) REQUIRE n.name IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:TechnographicCategory) REQUIRE n.name IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:SecForm) REQUIRE n.filingUrl IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:InvestmentSeries) REQUIRE n.id IS UNIQUE;\")\n", + "driver.execute_query(\"CREATE CONSTRAINT IF NOT EXISTS FOR (n:RevenueYear) REQUIRE n.id IS UNIQUE;\")\n", + "\n", "\n", "query = \"\"\"\n", "UNWIND $data AS row\n", - "MERGE (o:__Entity__ {id: row.diffbotUri})\n", - "SET o:Organization,\n", - " o.name = row.name,\n", + "MERGE (o:Organization {id: row.diffbotUri})\n", + "SET o.name = row.name,\n", " o.summary = row.summary,\n", " o.description = 
row.description,\n", " o.wikipediaUri = row.wikipediaUri,\n", @@ -415,8 +416,8 @@ " o.monthlyTraffic = toFloat(row.monthlyTraffic),\n", " o.nbEmployees = toInteger(row.nbEmployees),\n", " o.ipoStockExchange = row.ipo.stockExchange,\n", - " //o.ipoDate = CASE WHEN row.ipo IS NOT NULL THEN datetime({epochMillis: toInteger(row.ipo.timestamp)}) ELSE null END,\n", - " //o.foundingDate = CASE WHEN row.foundingDate IS NOT NULL THEN datetime({epochMillis: toInteger(row.foundingDate.timestamp)}) ELSE null END,\n", + " o.ipoDate = row.ipoDate,\n", + " o.foundingDate = row.foundingDateClean,\n", " o.stockTicker = CASE WHEN row.stock IS NOT NULL THEN row.stock.ticker ELSE null END,\n", " o.wikipediaPageviews = toInteger(row.wikipediaPageviews),\n", " o.nbUniqueInvestors = toInteger(row.nbUniqueInvestors),\n", @@ -430,84 +431,144 @@ "CALL {\n", " WITH o, row\n", " UNWIND row.suppliers AS supplier\n", - " MERGE (s:`__Entity__` {id: supplier.diffbotUri})\n", + " MERGE (s:`Organization` {id: supplier.diffbotUri})\n", " ON CREATE SET s.name = supplier.name,\n", " s.summary = supplier.summary\n", " MERGE (o)-[:HAS_SUPPLIER]->(s)\n", - " WITH s, supplier\n", - " CALL apoc.create.addLabels(s, [supplier.type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", "CALL {\n", " WITH o, row\n", " UNWIND row.competitors AS competitor\n", - " MERGE (s:`__Entity__` {id: competitor.diffbotUri})\n", + " MERGE (s:`Organization` {id: competitor.diffbotUri})\n", " ON CREATE SET s.name = competitor.name,\n", " s.summary = competitor.summary\n", " MERGE (o)-[:HAS_COMPETITOR]->(s)\n", - " WITH s, competitor\n", - " CALL apoc.create.addLabels(s, [competitor.type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", "CALL {\n", " WITH o, row\n", " UNWIND row.founders AS founder\n", - " MERGE (s:`__Entity__` {id: founder.diffbotUri})\n", + " MERGE (s:`Person` {id: founder.diffbotUri})\n", " ON CREATE SET s.name = founder.name,\n", " s.summary = founder.summary\n", " 
MERGE (o)-[:HAS_FOUNDER]->(s)\n", - " WITH s, founder\n", - " CALL apoc.create.addLabels(s, [founder.type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", "CALL {\n", " WITH o, row\n", " UNWIND row.partnerships AS partnership\n", - " MERGE (s:`__Entity__` {id: partnership.diffbotUri})\n", + " MERGE (s:`Organization` {id: partnership.diffbotUri})\n", " ON CREATE SET s.name = partnership.name,\n", " s.summary = partnership.summary\n", " MERGE (o)-[:PARTNERSHIP]-(s)\n", - " WITH s, partnership\n", - " CALL apoc.create.addLabels(s, [partnership.type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", "CALL {\n", " WITH o, row\n", " UNWIND row.customers AS customer\n", - " MERGE (s:`__Entity__` {id: customer.diffbotUri})\n", + " MERGE (s:`Organization` {id: customer.diffbotUri})\n", " ON CREATE SET s.name = customer.name,\n", " s.summary = customer.summary\n", " MERGE (o)-[:HAS_CUSTOMER]->(s)\n", - " WITH s, customer\n", - " CALL apoc.create.addLabels(s, [customer.type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", "CALL {\n", " WITH o, row\n", " UNWIND row.boardMembers AS board_member\n", - " MERGE (s:`__Entity__` {id: board_member.diffbotUri})\n", + " MERGE (s:`Person` {id: board_member.diffbotUri})\n", " ON CREATE SET s.name = board_member.name,\n", " s.summary = board_member.summary\n", " MERGE (o)-[:BOARD_MEMBER]->(s)\n", - " WITH s, board_member\n", - " CALL apoc.create.addLabels(s, [board_member.type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", "CALL {\n", " WITH o, row\n", " UNWIND row.subsidiaries AS subsidiary\n", - " MERGE (s:`__Entity__` {id: subsidiary.diffbotUri})\n", + " MERGE (s:`Organization` {id: subsidiary.diffbotUri})\n", " ON CREATE SET s.name = subsidiary.name,\n", " s.summary = subsidiary.summary\n", " MERGE (o)-[:HAS_SUBSIDIARY]->(s)\n", - " WITH s, subsidiary\n", - " CALL apoc.create.addLabels(s, [subsidiary.type]) YIELD node\n", + " RETURN count(*) AS 
count\n", + "}\n", + "WITH o, row\n", + "CALL {\n", + " WITH o, row\n", + " UNWIND row.secForms AS secForm\n", + " MERGE (s:`SecForm` {filingUrl: secForm.filingUrl})\n", + " ON CREATE SET s.formType = secForm.formType,\n", + " s.documentUrl = secForm.documentUrl,\n", + " s.filingDate = secForm.dateClean\n", + " MERGE (o)-[:HAS_SEC_FORM]->(s)\n", + " RETURN count(*) AS count\n", + "}\n", + "WITH o, row\n", + "CALL {\n", + " WITH o, row\n", + " UNWIND row.yearlyRevenues AS yearlyRevenue\n", + " MERGE (s:`RevenueYear` {id: o.name + toString(yearlyRevenue.year)})\n", + " // yearlyRevenues[].revenue is a {currency, value} map; reading `amount` always stored null\n", + " ON CREATE SET s.year = yearlyRevenue.year,\n", + " s.revenue = toFloat(yearlyRevenue.revenue.value),\n", + " s.currency = yearlyRevenue.revenue.currency\n", + " MERGE (o)-[:HAS_YEARLY_REVENUE]->(s)\n", + " WITH s ORDER BY s.year\n", + " WITH collect(s) AS revenues\n", + " // avoidDuplicates keeps re-runs of this cell from stacking parallel NEXT_YEAR links\n", + " CALL apoc.nodes.link(revenues, \"NEXT_YEAR\", {avoidDuplicates: true})\n", + " RETURN count(*) AS count\n", + "}\n", + "WITH o, row\n", + "CALL {\n", + " WITH o, row\n", + " UNWIND row.technographics AS technographic\n", + " MERGE (s:`Technographic` {name: technographic.technology.name})\n", + " MERGE (o)-[:HAS_TECHNOGRAPHIC]->(s)\n", + " WITH s, technographic\n", + " UNWIND technographic.categories AS category\n", + " MERGE (c:`TechnographicCategory` {name: category})\n", + " MERGE (s)-[:HAS_CATEGORY]->(c)\n", + " RETURN count(*) AS count\n", + "}\n", + "WITH o, row\n", + "CALL {\n", + " WITH o, row\n", + " UNWIND row.investments AS investment\n", + " MERGE (is:`InvestmentSeries` {id: o.name + investment.date.str})\n", + " ON CREATE SET is.amount = investment.amount.value,\n", + " is.series = investment.series,\n", + " is.currency = investment.amount.currency,\n", + " is.date = investment.dateClean\n", + " MERGE (o)-[:HAS_INVESTMENT]->(is)\n", + " WITH is, investment\n", + " UNWIND investment.investors as investor\n", + " CALL apoc.merge.node(\n", + " [investor.type],\n", + " {id: investor.diffbotUri},\n", + " {name: investor.name, summary:
investor.summary},\n", + " {}\n", + " ) YIELD node AS es\n", + " MERGE (es)-[:HAS_INVESTED]->(is)\n", + " RETURN count(*) AS count\n", + "}\n", + "WITH o, row\n", + "CALL {\n", + " WITH o, row\n", + " UNWIND row.locations AS location\n", + " MERGE (s:`Place` {address: location.address})\n", + " ON CREATE SET s.latitude = toFloat(location.latitude),\n", + " s.longitude = toFloat(location.longitude)\n", + " MERGE (o)-[h:HAS_LOCATION]->(s)\n", + " SET h.isPrimary = location.isPrimary\n", + " WITH s, location\n", + " WHERE location.country IS NOT NULL\n", + " MERGE (c:`Country` {id: location.country.diffbotUri})\n", + " ON CREATE SET c.name = location.country.name\n", + " MERGE (s)-[hc:HAS_COUNTRY]->(c)\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", @@ -515,12 +576,10 @@ " WITH o, row\n", " WITH o, row\n", " WHERE row.ceo IS NOT NULL\n", - " MERGE (c:`__Entity__` {id: row.ceo.diffbotUri})\n", + " MERGE (c:`Person` {id: row.ceo.diffbotUri})\n", " ON CREATE SET c.name = row.ceo.name,\n", " c.summary = row.ceo.summary\n", " MERGE (o)-[:HAS_CEO]->(c)\n", - " WITH c, row.ceo.type AS type\n", - " CALL apoc.create.addLabels(c, [type]) YIELD node\n", " RETURN count(*) AS count\n", "}\n", "WITH o, row\n", @@ -535,7 +594,7 @@ "metadata": { "id": "1ZPAuesYDcGJ" }, - "execution_count": 98, + "execution_count": 93, "outputs": [] }, { @@ -548,19 +607,19 @@ "base_uri": "https://localhost:8080/" }, "id": "F2z0DfbUU9gj", - "outputId": "b33ba816-ee29-437b-a110-cec4c885aec8" + "outputId": "bd6c1b16-4e79-4a70-86f6-b0d3c6f6af90" }, - "execution_count": 99, + "execution_count": 94, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "EagerResult(records=[], summary=, keys=[])" + "EagerResult(records=[], summary=, keys=[])" ] }, "metadata": {}, - "execution_count": 99 + "execution_count": 94 } ] }, @@ -570,7 +629,7 @@ "metadata": { "id": "_kwlF5wDVmCT" }, - "execution_count": null, + "execution_count": 87, "outputs": [] } ]