Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@ install:
- pip install sphinx
- pip install sphinx_rtd_theme
- pip install bibtexparser
- pip install numpy
- pip install sklearn
- pip install plotly
# - pip install pprint
# - sudo apt-get install python3-sphinx

script:
# Update paper list
- python bibtex2rst.py
- python embeddings.py
# Use Sphinx to make the html docs
- make html
# Tell GitHub not to use jekyll to compile the docs
Expand Down
88 changes: 88 additions & 0 deletions embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import json
import os

import bibtexparser
import numpy as np
import plotly
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from bibtex2rst import (bibtex_path, bib_files, template_file_path,
                        str2injcet, papers_count, sec_descriptions)
from reduce import get_2d_coordinates

def vectorize_text(text):
    """Return a dense TF-IDF matrix for an iterable of documents.

    Documents are lower-cased, English stop words are removed, and
    uni-, bi- and tri-grams are used as features. ``min_df=2`` and
    ``max_df=7`` are absolute document counts: a term is kept only if it
    occurs in at least 2 and at most 7 documents.
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 3),
        stop_words='english',
        max_df=7,
        min_df=2,
    )
    tfidf_matrix = vectorizer.fit_transform(text)
    return tfidf_matrix.toarray()

# Collect one (abstract, title, author, url, section index) tuple for every
# BibTeX entry that has an abstract. The section index is the position of
# the entry's .bib file in `bib_files`.
#
# NOTE(review): a reviewer asked "I can't find the reference to bib_files.
# Where is it defined?" -- it is imported from bibtex2rst at the top of this
# file, together with `bibtex_path`. The same review asked to confirm that
# `make html` still builds without errors after this change.
papers = []

for i, bibfile in enumerate(bib_files):
    with open(os.path.join(bibtex_path, bibfile)) as bibtex_file:
        parser = BibTexParser()
        parser.customization = convert_to_unicode
        bib_database = bibtexparser.load(bibtex_file, parser=parser)

    # Newest first. `year` is optional in BibTeX, so fall back to '' rather
    # than raising KeyError; entries without a year sort last.
    entries = sorted(
        bib_database.entries,
        key=lambda entry: entry.get('year', ''),
        reverse=True,
    )

    for item in entries:
        # Only entries with an abstract can be embedded.
        if 'abstract' not in item:
            continue
        papers.append((
            item['abstract'],
            item['title'],
            item['author'],
            item.get('url', ''),  # url is optional; keep the paper anyway
            i,
        ))

# Turn the collected tuples into a 2-D string array (one row per paper),
# embed the abstracts (column 0) and project them to 2-D coordinates.
papers = np.array(papers)
abstracts = papers[:, 0]
out = get_2d_coordinates(vectorize_text(abstracts))

# Column-oriented record of everything the plot needs, also saved as JSON.
db = {
    'annotation_text': [],
    'sec_title': [],
    'pos_x': [],
    'pos_y': [],
}
for paper, point in zip(papers, out):
    _abstract, title, author, url, section = paper
    coords = point.tolist()
    db['annotation_text'].append(f"<a href=\"{url}\">{title} ({author})</a>")
    db['sec_title'].append(section)
    db['pos_x'].append(coords[0])
    db['pos_y'].append(coords[1])

with open('embedding.json', 'w') as output:
    json.dump(db, output)


# Scatter plot of the 2-D embedding: one marker per paper, coloured by the
# index of the section (.bib file) it came from, with the linked title and
# authors as hover text. The result is written to embedding-plot.html.
section_ids = [int(section) for section in db['sec_title']]
marker_outline = dict(color='MediumPurple', width=2)
marker_style = dict(color='LightSkyBlue', size=15, line=marker_outline)
label_font = dict(color='black', size=10)

trace = plotly.graph_objs.Scatter(
    x=db['pos_x'],
    y=db['pos_y'],
    mode='markers',
    marker=marker_style,
    textfont=label_font,
    marker_color=section_ids,
    text=db['annotation_text'],
    textposition="top center",
    hoverinfo="text",
)
plotly.offline.plot([trace], filename='embedding-plot.html')
10 changes: 10 additions & 0 deletions reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from sklearn.manifold import TSNE


def get_2d_coordinates(vectorized_text):
    """Project feature vectors to two dimensions with t-SNE.

    ``random_state`` is fixed at 42 so repeated runs use the same seed.
    Returns whatever ``TSNE.fit_transform`` yields for the input: one
    2-component row per input row.
    """
    reducer = TSNE(n_components=2, random_state=42)
    return reducer.fit_transform(vectorized_text)

2 changes: 2 additions & 0 deletions research_template.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ in the ability to learn continuously from high-dimensional data. In this page,
we will keep track of recent **Continual/Lifelong** Learning developments in
the research community.

.. raw:: html
:file: embedding-plot.html

Publications
----------------------------------
Expand Down