-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Ubuntu
committed
Jul 20, 2020
1 parent
0f8ee78
commit b63180a
Showing
30 changed files
with
882 additions
and
0 deletions.
There are no files selected for viewing
226 changes: 226 additions & 0 deletions
226
PTSemanticSearch/Notebooks/01_look_at_Semantic_Corpus.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Look at Semantic Scholar Courpus Data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": "The autoreload extension is already loaded. To reload it, use:\n %reload_ext autoreload\n" | ||
} | ||
], | ||
"source": [ | ||
"%load_ext autoreload\n", | ||
"%autoreload 2\n", | ||
"%reload_ext autoreload\n", | ||
"\n", | ||
"%matplotlib inline\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import h5py\n", | ||
"from pathlib import Path\n", | ||
"\n", | ||
"from importlib.util import find_spec\n", | ||
"if find_spec(\"similarity_abstract_search\") is None:\n", | ||
" import sys\n", | ||
" sys.path.append('..')\n", | ||
" \n", | ||
"from similarity_abstract_search.datasets.dataset import SemanticCorpusDataset \n", | ||
"from similarity_abstract_search import utils" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"Path.ls = lambda x: list(x.iterdir())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/plain": "[PosixPath('../../Data/processed/paperID2emb.json'),\n PosixPath('../../Data/processed/embedIDs.h5'),\n PosixPath('../../Data/processed/paper_set.txt'),\n PosixPath('../../Data/processed/SemanticScholarData'),\n PosixPath('../../Data/processed/manifest.txt')]" | ||
}, | ||
"metadata": {}, | ||
"execution_count": 5 | ||
} | ||
], | ||
"source": [ | ||
"data_dir = Path('../../Data/processed/')\n", | ||
"data_dir.ls()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": "../../Data/processed/SemanticScholarData/pruned_and_clean103.json\n" | ||
}, | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/plain": "[PosixPath('../../Data/processed/SemanticScholarData/pruned_and_clean103.json'),\n PosixPath('../../Data/processed/SemanticScholarData/pruned_and_clean088.json'),\n PosixPath('../../Data/processed/SemanticScholarData/pruned_and_clean047.json')]" | ||
}, | ||
"metadata": {}, | ||
"execution_count": 6 | ||
} | ||
], | ||
"source": [ | ||
"data_fn = (data_dir/'SemanticScholarData').ls()\n", | ||
"fn = data_fn[0]\n", | ||
"print(fn)\n", | ||
"data_fn[:3]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"inner_path = str(data_dir/'SemanticScholarData')\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 77, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = SemanticCorpusDataset()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 78, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/plain": " id \\\nEmbeddingID \n10291659 3b9b9509bb4eea4711515a40a7f210d8561487aa \n13584557 c4c1d7a8051a5ee1562dc8b5722c3353b8d7e226 \n9811857 e300444162aa59fe05c18f99e36db8a1125275c3 \n764869 57c98b205d48d605f17d884bd6abe9a66c846989 \n10295489 e316c2902c421370001baf099e439cf68bef62fa \n... ... \n13139466 10e10bb1a46166152458e1959eadab88ab604d3d \n707527 e0ae60948a24dfa0e6296effb7750c00391f7d46 \n9324370 99d8a0df8507fc78e3a8cf4cc68bb3134a4d4b20 \n9802191 7983389420ea39ec8dc8440b921319e1d154601e \n9214270 3a4b5e3ece7dbeefc5e1664ea6a078e60c822f99 \n\n paperAbstract \\\nEmbeddingID \n10291659 Anterior segment dysgeneses are developmental ... \n13584557 The first synthesis of dolabelide C (1), a cyt... \n9811857 BACKGROUND & AIMS: Helicobacter pylori eradica... \n764869 BACKGROUND\\nMinorities are more prevalent than... \n10295489 Today, many tetraplegics benefit from surgical... \n... ... \n13139466 'Metadata' has received a fraction of the atte... \n707527 OBJECTIVES\\nTo describe average levels of free... \n9324370 I N VIVO confocal laser scanning microscopy (C... \n9802191 INTRODUCTION\\nCongenital tuberculosis is a rar... \n9214270 Development of sporangia in Phytophthora palmi... \n\n title \\\nEmbeddingID \n10291659 The 6p25 deletion syndrome: An update on a rar... \n13584557 Total synthesis of dolabelide C: a phosphate-m... \n9811857 Effects of Community Screening for Helicobacte... \n764869 Racial and ethnic disparities in physical abus... \n10295489 New concepts on treatment of the upper limb in... \n... ... \n13139466 Metadata accounts: Achieving data and evidence... \n707527 Human energy expenditure in affluent societies... \n9324370 The vascular features of psoriatic skin: imagi... \n9802191 Analysis of 170 cases of congenital TB reporte... \n9214270 Growth and ultrastructural differentiation of ... \n\n citeEmbeddingsID \nEmbeddingID \n10291659 [12279879, 11377917, 10611622, 4885766, 671395... \n13584557 [5007102] \n9811857 [9936039, 1851207, 446836, 11518128, 9864285, ... \n764869 [6924879, 457180] \n10295489 [7564871, 10920400, 13471692, 3174646, 10921211] \n... ... \n13139466 [2442410] \n707527 [1997359, 11775545, 7111644, 86053, 3490113, 1... \n9324370 [11362903, 12617902, 8475976, 12960323, 257405... \n9802191 [3540689, 12109320, 10107597, 983278, 9470101,... \n9214270 [3672511] \n\n[64 rows x 4 columns]", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>paperAbstract</th>\n <th>title</th>\n <th>citeEmbeddingsID</th>\n </tr>\n <tr>\n <th>EmbeddingID</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>10291659</th>\n <td>3b9b9509bb4eea4711515a40a7f210d8561487aa</td>\n <td>Anterior segment dysgeneses are developmental ...</td>\n <td>The 6p25 deletion syndrome: An update on a rar...</td>\n <td>[12279879, 11377917, 10611622, 4885766, 671395...</td>\n </tr>\n <tr>\n <th>13584557</th>\n <td>c4c1d7a8051a5ee1562dc8b5722c3353b8d7e226</td>\n <td>The first synthesis of dolabelide C (1), a cyt...</td>\n <td>Total synthesis of dolabelide C: a phosphate-m...</td>\n <td>[5007102]</td>\n </tr>\n <tr>\n <th>9811857</th>\n <td>e300444162aa59fe05c18f99e36db8a1125275c3</td>\n <td>BACKGROUND & AIMS: Helicobacter pylori eradica...</td>\n <td>Effects of Community Screening for Helicobacte...</td>\n <td>[9936039, 1851207, 446836, 11518128, 9864285, ...</td>\n </tr>\n <tr>\n <th>764869</th>\n <td>57c98b205d48d605f17d884bd6abe9a66c846989</td>\n <td>BACKGROUND\\nMinorities are more prevalent than...</td>\n <td>Racial and ethnic disparities in physical abus...</td>\n <td>[6924879, 457180]</td>\n </tr>\n <tr>\n <th>10295489</th>\n <td>e316c2902c421370001baf099e439cf68bef62fa</td>\n <td>Today, many tetraplegics benefit from surgical...</td>\n <td>New concepts on treatment of the upper limb in...</td>\n <td>[7564871, 10920400, 13471692, 3174646, 10921211]</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>13139466</th>\n <td>10e10bb1a46166152458e1959eadab88ab604d3d</td>\n <td>'Metadata' has received a fraction of the atte...</td>\n <td>Metadata accounts: Achieving data and evidence...</td>\n <td>[2442410]</td>\n </tr>\n <tr>\n <th>707527</th>\n <td>e0ae60948a24dfa0e6296effb7750c00391f7d46</td>\n <td>OBJECTIVES\\nTo describe average levels of free...</td>\n <td>Human energy expenditure in affluent societies...</td>\n <td>[1997359, 11775545, 7111644, 86053, 3490113, 1...</td>\n </tr>\n <tr>\n <th>9324370</th>\n <td>99d8a0df8507fc78e3a8cf4cc68bb3134a4d4b20</td>\n <td>I N VIVO confocal laser scanning microscopy (C...</td>\n <td>The vascular features of psoriatic skin: imagi...</td>\n <td>[11362903, 12617902, 8475976, 12960323, 257405...</td>\n </tr>\n <tr>\n <th>9802191</th>\n <td>7983389420ea39ec8dc8440b921319e1d154601e</td>\n <td>INTRODUCTION\\nCongenital tuberculosis is a rar...</td>\n <td>Analysis of 170 cases of congenital TB reporte...</td>\n <td>[3540689, 12109320, 10107597, 983278, 9470101,...</td>\n </tr>\n <tr>\n <th>9214270</th>\n <td>3a4b5e3ece7dbeefc5e1664ea6a078e60c822f99</td>\n <td>Development of sporangia in Phytophthora palmi...</td>\n <td>Growth and ultrastructural differentiation of ...</td>\n <td>[3672511]</td>\n </tr>\n </tbody>\n</table>\n<p>64 rows × 4 columns</p>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"execution_count": 78 | ||
} | ||
], | ||
"source": [ | ||
"data.load_one_batch()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 79, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": "Title is : Population studies and validation of paternity determinations by six microsatellite loci.\nAbstract is : A single locus system of 6 microsatellite markers was evaluated for paternity testing. A nonradioactive method based on peroxidase labeling of a DNA probe was used to estimate the allele frequency of markers D1S216, D3S1217, D7S480, D9S157, D13S153, and D16S422 by genotyping 1134-1698 chromosomes. The number of detected alleles were 22, 15, 23, 10, 16, and 19, respectively, and the allele frequency varied from 0.001 to 0.317. The genotype of 87 families, consisting of mother, father, and child was determined. The probability that a random individual will give a positive paternity was evaluated. We conclude that the markers can be reliably typed and give sufficient and reliable information for paternity testing.\nLink is : https://www.semanticscholar.org/paper/7d9addb22565ebf79a19b1a77683d33ff0663f7d\n" | ||
} | ||
], | ||
"source": [ | ||
"data.show_data()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 15, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"h5file = utils.loadH5file(data_dir/'embedIDs.h5')\n", | ||
"trainData = np.column_stack(( h5file['paper' ][:] , h5file['cites' ][:] ))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 25, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/plain": "(array([10291659, 13584557, 9811857, 764869], dtype=int32),\n array([array([12279879, 11377917, 10611622, 4885766, 6713959, 11472980,\n 4985806, 5487724, 684838, 13727658, 7407284, 3248404,\n 4723212, 5442396, 5061212, 10502502, 9808902, 8182254,\n 9085600, 3028060, 9996880, 9661559, 11418460, 9824852,\n 8882707, 8722993], dtype=int32),\n array([5007102], dtype=int32),\n array([ 9936039, 1851207, 446836, 11518128, 9864285, 6948941,\n 12636073, 1527052, 11381674, 8982488, 6955016], dtype=int32),\n array([6924879, 457180], dtype=int32)], dtype=object))" | ||
}, | ||
"metadata": {}, | ||
"execution_count": 25 | ||
} | ||
], | ||
"source": [ | ||
"h5file['paper'][:4], h5file['cites'][:4]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/plain": "(13786318, 2)" | ||
}, | ||
"metadata": {}, | ||
"execution_count": 26 | ||
} | ||
], | ||
"source": [ | ||
"trainData.shape" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": 3 | ||
}, | ||
"orig_nbformat": 2, | ||
"kernelspec": { | ||
"name": "python_defaultSpec_1595191428755", | ||
"display_name": "Python 3.7.7 64-bit ('base': conda)" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.