Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions papers/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import itertools
import fnmatch # unix-like match
from slugify import slugify
import concurrent.futures

import papers
from papers import logger
Expand Down Expand Up @@ -790,7 +791,40 @@ def fetchcmd(parser, o):
print(fetch_bibtex_by_fulltext_crossref(field))

def extractcmd(parser, o):
    """Print bibtex metadata extracted from a PDF, or from every PDF under a
    directory when ``--recursive`` is given.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        The sub-command parser (unused here; kept for command-dispatch symmetry).
    o : argparse.Namespace
        Parsed options: ``pdf`` (path to a PDF file or a directory),
        ``recursive``, ``fulltext``, ``scholar``, ``word_count``, ``image``.

    Raises
    ------
    ValueError
        If ``o.pdf`` is neither a ``.pdf`` file nor a directory given
        together with ``--recursive``.
    """
    # Shared keyword arguments for extract_pdf_metadata in both branches.
    extract_kwargs = dict(
        search_doi=not o.fulltext,
        search_fulltext=True,
        scholar=o.scholar,
        minwords=o.word_count,
        max_query_words=o.word_count,
        image=o.image,
    )
    if os.path.isdir(o.pdf) and o.recursive:
        # NOTE: a ProcessPoolExecutor version of this loop runs and gives a
        # clear speedup, but the parallel writes to .cache/papers/crossref.json
        # and crossref-bibtex.json race with each other and clobber the file
        # format, leaving the base `papers` command unusable with json load
        # failures. Until the cache writes are serialized, extraction is done
        # serially here. TODO: make the cache writes concurrency-safe, then
        # parallelize this loop (concurrent.futures is already imported).
        for pdf in Path(o.pdf).rglob('*.pdf'):
            print(extract_pdf_metadata(pdf, **extract_kwargs))
    elif os.path.isfile(o.pdf) and o.pdf.endswith('.pdf'):
        print(extract_pdf_metadata(o.pdf, **extract_kwargs))
    else:
        raise ValueError('extract requires a single pdf or a directory and --recursive.')
    # TODO trivially extend this for len(o.file) > 1, but no dir


Expand Down Expand Up @@ -1265,6 +1299,7 @@ def get_parser(config=None):
extractp.add_argument('--fulltext', action='store_true', help='fulltext only (otherwise DOI-based)')
extractp.add_argument('--scholar', action='store_true', help='use google scholar instead of default crossref for fulltext search')
extractp.add_argument('--image', action='store_true', help='convert to image and use tesseract instead of pdftotext')
extractp.add_argument('--recursive', action='store_true', help='takes one directory as an argument; recursively descends into it and shows extracted bib info for each pdf')

# *** Pure OS related file checks ***

Expand Down Expand Up @@ -1396,4 +1431,4 @@ def main_clean_exit(args=None):
if __name__ == "__main__":
# we use try/except here to use a clean exit instead of trace
# test and debugging may use main() directly for speed-up => better to avoid sys.exit there
main_clean_exit()
main_clean_exit()
18 changes: 17 additions & 1 deletion papers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,20 @@ def _update_paths_to_absolute(self):

def status(self, check_files=False, verbose=False):

def _count_files_in_bibtex(db):
"""
Given a bibtexparser database, return the file count
in it, over all the guys that have multiple files.
"""
file_count = 0
for entry in db.entries:
# assumes papers only sticks things in a file = {:whatever.pdf:pdf} line
if 'file' in entry:
# assumes papers has multiple files separated by ';'
files = entry['file'].split('.pdf:pdf;')
file_count += len(files)
return file_count

def _fmt_path(p):
if self.local:
return os.path.relpath(p, ".")
Expand Down Expand Up @@ -210,7 +224,9 @@ def _fmt_path(p):
bibtexstring = open(self.bibtex).read()
db = parse_string(bibtexstring)
if len(db.entries):
status = bcolors.OKBLUE+' ({} entries)'.format(len(db.entries))+bcolors.ENDC
file_count = _count_files_in_bibtex(db)
status = bcolors.OKBLUE+' ({} files in {} entries)'.format(file_count, len(db.entries))+bcolors.ENDC
del file_count
else:
status = bcolors.WARNING+' (empty)'+bcolors.ENDC
except:
Expand Down
2 changes: 0 additions & 2 deletions tests/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
import shutil
import subprocess as sp
import tempfile
import unittest
from pathlib import Path

import bibtexparser
from papers.entries import parse_file as bp_parse_file, parse_string, get_entry_val
from papers.encoding import entry_to_unicode_dict

Expand Down
137 changes: 131 additions & 6 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,150 @@
import unittest
import os
import tempfile
import shutil

from papers.extract import extract_pdf_metadata
from papers.entries import parse_string
from tests.common import paperscmd, prepare_paper

from papers.bib import Biblio
from tests.common import paperscmd, prepare_paper, prepare_paper2, BibTest


class TestSimple(unittest.TestCase):
    """End-to-end checks for DOI detection and bibtex fetching on a single
    sample PDF prepared by ``prepare_paper()``."""

    def setUp(self):
        # prepare_paper() returns the sample PDF path along with its known
        # metadata: doi, original/expected citation keys, year, reference
        # bibtex string, and the expected renamed filename.
        (
            self.pdf,
            self.doi,
            self.key,
            self.newkey,
            self.year,
            self.bibtex,
            self.file_rename,
        ) = prepare_paper()
        self.assertTrue(os.path.exists(self.pdf))

    def test_doi(self):
        # `papers doi <pdf>` should print exactly the known DOI.
        self.assertEqual(
            paperscmd(f"doi {self.pdf}", sp_cmd="check_output").strip(), self.doi
        )

    def test_fetch(self):
        # `papers fetch <doi>` should return bibtex equivalent (entry by
        # entry, field by field) to the reference bibtex for this paper.
        bibtexs = paperscmd(f"fetch {self.doi}", sp_cmd="check_output").strip()
        db1 = parse_string(bibtexs)
        db2 = parse_string(self.bibtex)
        self.assertEqual(
            [dict(e.items()) for e in db1.entries],
            [dict(e.items()) for e in db2.entries],
        )

    def test_fetch_scholar(self):
        # Smoke test: scholar-based extraction should not raise.
        # NOTE(review): called twice — presumably to also exercise the cached
        # second lookup; confirm this duplication is intentional.
        extract_pdf_metadata(self.pdf, scholar=True)
        extract_pdf_metadata(self.pdf, scholar=True)


class TestAddDir(BibTest):
    """Checks that `papers add --recursive` (and Biblio.scan_dir) picks up
    PDFs from a directory tree, including a nested subdirectory."""
    # TODO delete this later
    def setUp(self):
        (
            self.pdf1,
            self.doi,
            self.key1,
            self.newkey1,
            self.year,
            self.bibtex1,
            self.file_rename1,
        ) = prepare_paper()
        # NOTE(review): this unpack overwrites self.doi and self.year from the
        # first fixture; harmless here since neither test reads them, but the
        # per-paper naming used in TestRecursiveExtract is clearer — confirm.
        (
            self.pdf2,
            self.si,
            self.doi,
            self.key2,
            self.newkey2,
            self.year,
            self.bibtex2,
            self.file_rename2,
        ) = prepare_paper2()
        # Layout: one PDF at the top of a temp dir, one in a subdirectory,
        # so recursion (not just a flat listing) is actually exercised.
        self.somedir = tempfile.mktemp(prefix="papers.somedir")
        self.subdir = os.path.join(self.somedir, "subdir")
        os.makedirs(self.somedir)
        os.makedirs(self.subdir)
        shutil.copy(self.pdf1, self.somedir)
        shutil.copy(self.pdf2, self.subdir)
        self.mybib = tempfile.mktemp(prefix="papers.bib")
        paperscmd(f"install --local --no-prompt --bibtex {self.mybib}")

    def test_adddir_pdf(self):
        # Library API path: scan_dir should find both PDFs and register them
        # under their updated (fetched) citation keys.
        self.my = Biblio.load(self.mybib, "")
        self.my.scan_dir(self.somedir)
        self.assertEqual(len(self.my.db.entries), 2)
        keys = [self.my.db.entries[0]["ID"], self.my.db.entries[1]["ID"]]
        self.assertEqual(
            sorted(keys), sorted([self.newkey1, self.newkey2])
        )  # PDF: update key

    def test_adddir_pdf_cmd(self):
        # CLI path: `papers add --recursive` should produce the same result.
        paperscmd(f"add --recursive --bibtex {self.mybib} {self.somedir}")
        self.my = Biblio.load(self.mybib, "")
        self.assertEqual(len(self.my.db.entries), 2)
        keys = [self.my.db.entries[0]["ID"], self.my.db.entries[1]["ID"]]
        self.assertEqual(
            sorted(keys), sorted([self.newkey1, self.newkey2])
        )  # PDF: update key

    def tearDown(self):
        # Remove the temp bibtex and directory tree, then undo the local
        # `papers install` so later tests start from a clean state.
        os.remove(self.mybib)
        shutil.rmtree(self.somedir)
        paperscmd(f"uninstall")


class TestRecursiveExtract(unittest.TestCase):
    """Fixture for the `papers extract` command: two sample papers laid out
    in a directory tree (one in a subdirectory) with a locally installed
    bibtex, mirroring the setup that `extract --recursive` descends into."""

    def setUp(self):
        # Two independent sample papers with per-paper attribute names
        # (doi1/doi2, year1/year2, ...) so nothing is overwritten.
        (
            self.pdf1,
            self.doi1,
            self.key1,
            self.newkey1,
            self.year1,
            self.bibtex1,
            self.file_rename1,
        ) = prepare_paper()
        (
            self.pdf2,
            self.si2,
            self.doi2,
            self.key2,
            self.newkey2,
            self.year2,
            self.bibtex2,
            self.file_rename2,
        ) = prepare_paper2()
        # One PDF at the top level, one nested, to exercise recursion.
        self.somedir = tempfile.mktemp(prefix="papers.somedir")
        self.subdir = os.path.join(self.somedir, "subdir")
        os.makedirs(self.somedir)
        os.makedirs(self.subdir)
        shutil.copy(self.pdf1, self.somedir)
        shutil.copy(self.pdf2, self.subdir)
        self.mybib = tempfile.mktemp(prefix="papers.bib")
        paperscmd(f"install --local --no-prompt --bibtex {self.mybib}")
        self.assertTrue(os.path.exists(self.pdf1))
        self.assertTrue(os.path.exists(self.pdf2))

    def test_doi(self):
        # `papers doi` on the first sample PDF should print its known DOI.
        self.assertEqual(
            paperscmd(f"doi {self.pdf1}", sp_cmd="check_output").strip(), self.doi1
        )

    def test_fetch(self):
        # `papers extract <pdf>` on a single file should emit bibtex equal
        # (entry by entry) to the reference bibtex for that paper.
        bibtexs = paperscmd(f"extract {self.pdf1}", sp_cmd="check_output").strip()
        db1 = parse_string(bibtexs)
        db2 = parse_string(self.bibtex1)
        self.assertEqual(
            [dict(e.items()) for e in db1.entries],
            [dict(e.items()) for e in db2.entries],
        )

    def tearDown(self):
        # Clean up temp files and undo the local `papers install`.
        os.remove(self.mybib)
        shutil.rmtree(self.somedir)
        paperscmd(f"uninstall")
9 changes: 2 additions & 7 deletions tests/test_filecheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,11 @@
"""
import os
import shutil
import subprocess as sp
import tempfile
import unittest
from pathlib import Path

import bibtexparser

from papers.bib import Biblio
from papers.entries import get_entry_val
from tests.common import PAPERSCMD, paperscmd, prepare_paper, prepare_paper2, BibTest
from tests.common import paperscmd, prepare_paper, BibTest


class TestFileCheck(BibTest):
Expand Down Expand Up @@ -99,4 +94,4 @@ def test_filecheck_clean_filesdir(self):
self.papers('uninstall')

def tearDown(self):
self.temp_dir.cleanup()
self.temp_dir.cleanup()