Skip to content

Commit

Permalink
Redo pdf parsing with promises; update UI to show search results + a …
Browse files Browse the repository at this point in the history
…few other changes
  • Loading branch information
roman-r-m committed Jan 29, 2021
1 parent c250980 commit 8cd9f20
Showing 1 changed file with 80 additions and 80 deletions.
160 changes: 80 additions & 80 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,79 +1,59 @@
import joplin from 'api';
import JoplinData from 'api/JoplinData';
import { MenuItemLocation } from 'api/types';

/**
 * Promise adapter around the callback-style `pdfToText(path, callback)`.
 *
 * @param path - Location of the PDF; passed to `pdfToText` unchanged.
 * @returns A promise that resolves with the value `pdfToText` hands to its
 *   completion callback, and rejects if the `pdfToText` call itself throws or
 *   returns a rejected promise.
 */
async function extractText(path: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    // Success is reported through the callback. Failures surfacing from the call
    // itself are forwarded to `reject`; otherwise the caller would await forever.
    Promise.resolve(pdfToText(path, resolve)).catch(reject);
  });
}
import { MenuItemLocation, ViewHandle } from 'api/types';
import Joplin from 'api/Joplin';

import * as pdfjs from 'pdfjs-dist/es5/build/pdf.js';
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry';
import JoplinViewsMenuItems from 'api/JoplinViewsMenuItems';

pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorker;

async function pdfToText(path) {
const pdf = await pdfjs.getDocument(path).promise;
let complete = 0;
let total = pdf.numPages;
let pages = {};
for (let pagei = 1; pagei <= total; pagei++) {
const page = await pdf.getPage(pagei);
let pageNumber = page.pageNumber;
const textContent = await page.getTextContent();
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}

async function pdfToText(path, callbackAllDone) {
let PDFJS = require('pdfjs-dist/es5/build/pdf.js');
const pdfjsWorker = await import('pdfjs-dist/build/pdf.worker.entry');
PDFJS.GlobalWorkerOptions.workerSrc = pdfjsWorker;

PDFJS.getDocument(path).promise.then(function(pdf) {
let complete = 0;
let total = pdf.numPages;
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace

page_text += item.str;
last_item = item;
} // ends for every item of text

textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items

++complete;

// If all done, put pages in order and combine all
// text, then pass that to the callback
if (complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
page_text += item.str;
last_item = item;
}
pages[pageNumber] = page_text + "\n\n";
}
++complete;
if (complete == total) {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
return full_text;
}
}
return '';
}

async function initDb(path: string) {
const sqlite3 = joplin.plugins.require('sqlite3');
await (require('fs-extra').remove(`${path}/resource.sqlite`));
const db = new sqlite3.Database(`${path}/resource.sqlite`);
await db.run('CREATE VIRTUAL TABLE IF NOT EXISTS resources_fts USING fts5(id, title, text)');
return db;
Expand All @@ -97,22 +77,42 @@ async function indexResource(resource: any, resourceDir: string, db: any) {
if (resource.mime === 'application/pdf') {
const fs = joplin.plugins.require('fs-extra'); // TODO import once

const text = await extractText(`${resourceDir}/${resource.id}.pdf`);
const text = await pdfToText(`${resourceDir}/${resource.id}.pdf`);
console.log(`extracted text from ${resource.title}: ${text.substring(0, 100)}`);

await db.run('INSERT INTO resources_fts VALUES(?, ?, ?)', resource.id, resource.title, text);
}
}

/**
 * Escapes the characters that have special meaning in HTML so that arbitrary
 * text can be placed inside element content or attribute values.
 * `null`/`undefined` become an empty string.
 */
function escapeHtml(value: unknown): string {
  const text = value == null ? '' : String(value);
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Renders the search hits as an HTML table and shows it in the given dialog.
 *
 * For every hit the notes referencing the resource are looked up through
 * `joplin.data.get(['resources', id, 'notes'])`; the title of the first note
 * (or an empty string when there is none) is shown next to the resource title.
 *
 * @param joplin - Plugin API object used for the data lookup and the dialog calls.
 * @param dialog - Handle of the dialog whose HTML is replaced and which is then opened.
 * @param searchResult - Rows with `id` and `title`. A non-array value (for
 *   example `undefined` after a failed query) is treated as "no results".
 */
async function onSearchResult(joplin: Joplin, dialog: ViewHandle, searchResult: any[]) {
  console.log(`result: ${JSON.stringify(searchResult)}`);

  // The query callback may hand over `undefined` on error; render an empty table then.
  const rows = Array.isArray(searchResult) ? searchResult : [];

  let html = `<div><table style="table-layout: auto;"><thead><tr><th>Title</th><th>Included in note(s)</th></tr></thead><tbody>`;
  for (const result of rows) {
    const notes = (await joplin.data.get(['resources', result.id, 'notes'], { fields: ['id', 'title'] })).items;
    const noteTitle = !!notes && notes.length > 0 ? notes[0].title : '';
    // Titles are user-controlled text, so they are escaped before being embedded.
    html += `
<tr>
<td>${escapeHtml(result.title)}</td>
<td>
<a href="#" onclick="" >${escapeHtml(noteTitle)}</a>
</td>
</tr>`;
  }
  html += '</tbody></table></div>';

  await joplin.views.dialogs.setHtml(dialog, html);
  await joplin.views.dialogs.open(dialog);
}

joplin.plugins.register({
onStart: async function() {
console.info('Plugin started!');
const profileDir = await joplin.settings.globalValue('profileDir');
const resourceDir = await joplin.settings.globalValue('resourceDir');
console.log(`the profile is in ${profileDir}`);

const profileDir = await joplin.plugins.dataDir();
const db = await initDb(profileDir);

const resourceDir = await joplin.settings.globalValue('resourceDir');
await indexResources(joplin.data, resourceDir, db);

const searchDialogHandle = await joplin.views.dialogs.create('searchDialog');
Expand All @@ -122,19 +122,19 @@ joplin.plugins.register({
</form>
`);

const searchResultsDialogHandle = await joplin.views.dialogs.create('resultsDialog');

await joplin.commands.register({
name: 'searchAttachments',
label: 'Search in attachments',
execute: async () => {
console.log('here be search');
const result = await joplin.views.dialogs.open(searchDialogHandle);
console.log(`and the query is ${JSON.stringify(result)}`);

if (result.id === 'ok') {
const query = result.formData.form.query;
console.log(`query ${query}`);
db.all('SELECT * FROM resources_fts WHERE text MATCH ?', query, (err, searchResult) => {
console.log(`result: ${JSON.stringify(searchResult)}`);
});
db.all('SELECT id,title FROM resources_fts WHERE text MATCH ?', query, async (_err, searchResult) =>
onSearchResult(joplin, searchResultsDialogHandle, searchResult)
);
}
},
})
Expand Down

0 comments on commit 8cd9f20

Please sign in to comment.