Skip to content

Commit

Permalink
refactor database logic
Browse files Browse the repository at this point in the history
  • Loading branch information
roman-r-m committed Feb 5, 2021
1 parent db0b909 commit 42abbdb
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 68 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
],
"devDependencies": {
"@types/node": "^14.0.14",
"@types/sqlite3": "^3.1.7",
"canvas": "^2.6.1",
"chalk": "^4.1.0",
"copy-webpack-plugin": "^6.1.0",
Expand Down
5 changes: 4 additions & 1 deletion plugin.config.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
{
"extraScripts": []
"extraScripts": [
"index/pdf.ts",
"database.ts"
]
}
54 changes: 54 additions & 0 deletions src/database.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import { Database, sqlite3 } from 'sqlite3';

// Version of the index schema. initDb compares it with the 'version' row kept in
// the settings table and rebuilds the search index when the two differ.
const SCHEMA_VERSION = 1;

/**
 * Runs a SQL statement that produces rows and resolves with all of them.
 *
 * @param db     Open sqlite3 database handle.
 * @param query  SQL text; `?` placeholders are bound from `params` in order.
 * @param params Values bound to the placeholders.
 * @returns A promise resolving to the array of result rows (empty when nothing
 *          matches). It rejects with the error reported by sqlite3.
 */
function query(db: Database, query: string, ...params: unknown[]): Promise<unknown> {
  return new Promise((resolve, reject) => {
    // Database#all is used because Database#run never hands rows to its
    // callback, which would make every lookup resolve with undefined.
    db.all(query, params, (err: Error | null, rows: unknown[]) => {
      if (err) {
        reject(err);
      } else {
        resolve(rows);
      }
    });
  });
}

/**
 * Executes a SQL statement whose result rows are not needed (DDL, INSERT,
 * UPDATE, DELETE, VACUUM, ...).
 *
 * @param db     Open sqlite3 database handle.
 * @param query  SQL text; `?` placeholders are bound from `params` in order.
 * @param params Values bound to the placeholders.
 * @returns A promise resolving to `true` once the statement has completed. It
 *          rejects with the Error reported by sqlite3.
 */
function run(db: Database, query: string, ...params: unknown[]): Promise<boolean> {
  return new Promise((resolve, reject) => {
    db.run(query, params, (err: Error | null) => {
      if (err) {
        // Reject with the Error object rather than only its message: this keeps
        // the stack trace and matches how query() reports failures.
        reject(err);
      } else {
        resolve(true);
      }
    });
  });
}

/**
 * Opens (creating it if necessary) `<path>/resources.sqlite` and makes sure it
 * carries the current schema.
 *
 * When the version recorded in the `settings` table differs from
 * SCHEMA_VERSION (or no version is recorded), the full-text index and the
 * `index_time` bookkeeping are recreated from scratch and the new version is
 * stored. Otherwise the existing index is kept as is.
 *
 * @param path    Directory that holds the database file.
 * @param sqlite3 The sqlite3 module, injected by the caller.
 * @returns The opened database handle.
 */
async function initDb(path: string, sqlite3: sqlite3): Promise<Database> {
  const db: Database = new sqlite3.Database(`${path}/resources.sqlite`);
  await run(db, 'CREATE TABLE IF NOT EXISTS settings (name TEXT PRIMARY KEY, value TEXT)');

  const storedVersion = await readSchemaVersion(db);
  if (storedVersion !== SCHEMA_VERSION) {
    // rebuild index
    await run(db, 'DROP TABLE IF EXISTS resources_fts');
    await run(db, 'CREATE VIRTUAL TABLE IF NOT EXISTS resources_fts USING fts5(id, title, text)');

    // create or clean index_time
    await run(db, 'CREATE TABLE IF NOT EXISTS index_time (id TEXT PRIMARY KEY, index_time INTEGER)');
    await run(db, 'DELETE FROM index_time');

    await run(db, 'VACUUM');

    // Written last, so a recorded version implies the tables above exist.
    await run(
      db,
      "INSERT INTO settings VALUES('version', ?) ON CONFLICT(name) DO UPDATE SET value = excluded.value",
      SCHEMA_VERSION,
    );
  }
  return db;
}

/** Reads the schema version recorded in `settings`; resolves to undefined when none is stored. */
function readSchemaVersion(db: Database): Promise<number | undefined> {
  return new Promise((resolve, reject) => {
    db.get(
      'SELECT value FROM settings WHERE name = ?',
      ['version'],
      (err: Error | null, row?: { value?: unknown }) => {
        if (err) {
          reject(err);
          return;
        }
        // `value` is a TEXT column, so convert before comparing with the
        // numeric constant; a missing row means no version was ever written.
        resolve(row == null || row.value == null ? undefined : Number(row.value));
      },
    );
  });
}

/**
 * Adds or refreshes the search-index entry for one resource and records when
 * it was indexed.
 *
 * The statements run one after another. The returned promise settles when all
 * of them have finished, so callers can await it or attach a `catch` to see
 * failures.
 *
 * @param db    Open sqlite3 database handle.
 * @param title Resource title stored alongside the text.
 * @param id    Resource id; key of the `index_time` row.
 * @param text  Extracted text to make searchable.
 */
async function addToIndex(db: Database, title: string, id: string, text: string): Promise<void> {
  // resources_fts has no uniqueness constraint, so remove any earlier row for
  // this id first; otherwise re-indexing a resource leaves duplicate entries.
  await run(db, 'DELETE FROM resources_fts WHERE id = ?', id);
  await run(db, 'INSERT INTO resources_fts VALUES(?, ?, ?)', id, title, text);

  // Whole seconds, matching the INTEGER column.
  // NOTE(review): confirm seconds is the same unit as the `updated_time`
  // values this timestamp is later compared against.
  const indexedAt = Math.floor(Date.now() / 1000);
  await run(
    db,
    'INSERT INTO index_time VALUES(?, ?) ON CONFLICT(id) DO UPDATE SET index_time = excluded.index_time',
    id,
    indexedAt,
  );
}

export { initDb, run, query, addToIndex };
85 changes: 18 additions & 67 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,85 +2,35 @@ import joplin from 'api';
import JoplinData from 'api/JoplinData';
import { MenuItemLocation, ViewHandle } from 'api/types';
import Joplin from 'api/Joplin';
import { pdfToText} from './index/pdf';
import { addToIndex, initDb, query } from './database';
import { Database } from 'sqlite3';

import * as pdfjs from 'pdfjs-dist/es5/build/pdf.js';
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry';
import JoplinViewsMenuItems from 'api/JoplinViewsMenuItems';

pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorker;

async function pdfToText(path) {
const pdf = await pdfjs.getDocument(path).promise;
let complete = 0;
let total = pdf.numPages;
let pages = {};
for (let pagei = 1; pagei <= total; pagei++) {
const page = await pdf.getPage(pagei);
let pageNumber = page.pageNumber;
const textContent = await page.getTextContent();
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}

page_text += item.str;
last_item = item;
}
pages[pageNumber] = page_text + "\n\n";
}
++complete;
if (complete == total) {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
return full_text;
}
}
return '';
}

async function initDb(path: string) {
const sqlite3 = joplin.plugins.require('sqlite3');
await (require('fs-extra').remove(`${path}/resource.sqlite`));
const db = new sqlite3.Database(`${path}/resource.sqlite`);
await db.run('CREATE VIRTUAL TABLE IF NOT EXISTS resources_fts USING fts5(id, title, text)');
return db;
}

async function indexResources(api: JoplinData, resourceDir: string, db: any) {
async function indexResources(api: JoplinData, resourceDir: string, db: Database) {
let page = 0;
let response = await api.get(['resources'], { page: page, fields: ['id', 'title', 'mime']});
let response = await api.get(['resources'], { page: page, fields: ['id', 'title', 'mime', 'updated_time']});
console.log(`response: ${JSON.stringify(response)}`);
response.items.forEach(r => indexResource(r, resourceDir, db));
while (!!response.has_more) {
page += 1;
response = await api.get(['resources'], { page: page, fields: ['id', 'title', 'mime']});
response = await api.get(['resources'], { page: page, fields: ['id', 'title', 'mime', 'updated_time']});
console.log(`response: ${JSON.stringify(response)}`);
response.items.forEach(r => indexResource(r, resourceDir, db));
}
}

async function indexResource(resource: any, resourceDir: string, db: any) {
async function indexResource(resource: any, resourceDir: string, db: Database) {
console.log(`index ${JSON.stringify(resource)}`);
const lastIndexed = await query(db, 'SELECT index_time FROM index_time WHERE id = ?', resource.id);
console.log(`indexed=${lastIndexed}, updated=${resource.updated_time}`);
if (lastIndexed > resource.updated_time) {
console.log(`Skip indexing ${resource.id}/${resource.title}`);
return;
}
if (resource.mime === 'application/pdf') {
const fs = joplin.plugins.require('fs-extra'); // TODO import once

const text = await pdfToText(`${resourceDir}/${resource.id}.pdf`);
console.log(`extracted text from ${resource.title}: ${text.substring(0, 100)}`);

await db.run('INSERT INTO resources_fts VALUES(?, ?, ?)', resource.id, resource.title, text);
addToIndex(db, resource.title, resource.id, text);
}
}

Expand Down Expand Up @@ -108,9 +58,9 @@ async function onSearchResult(joplin: Joplin, dialog: ViewHandle, searchResult:

joplin.plugins.register({
onStart: async function() {
console.info('Plugin started!');
const profileDir = await joplin.plugins.dataDir();
const db = await initDb(profileDir);
try {
const dbPath = await joplin.plugins.dataDir();
const db = await initDb(dbPath, joplin.plugins.require('sqlite3'));

const resourceDir = await joplin.settings.globalValue('resourceDir');
await indexResources(joplin.data, resourceDir, db);
Expand Down Expand Up @@ -139,5 +89,6 @@ joplin.plugins.register({
},
})
await joplin.views.menuItems.create('Search in attachments', 'searchAttachments', MenuItemLocation.Edit);
} catch (e) { console.error(e)}
},
});
48 changes: 48 additions & 0 deletions src/index/pdf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import * as pdfjs from 'pdfjs-dist/es5/build/pdf.js';
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry';

// Configure pdf.js to use the worker entry imported above.
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorker;

/**
 * Extracts the text of a PDF document as one string.
 *
 * Pages are read in order. The text fragments of each page are concatenated,
 * with a line break or a space inserted between fragments depending on their
 * positions, and every page is terminated by a blank line. Pages for which
 * pdf.js reports no text items contribute nothing.
 *
 * @param path Document source handed to `pdfjs.getDocument`.
 * @returns The concatenated text; an empty string for a document without pages.
 */
async function pdfToText(path) {
  const pdf = await pdfjs.getDocument(path).promise;
  // True when a fragment is, or ends in, a single letter standing on its own.
  // No separating space is inserted after such a fragment.
  const loneTrailingLetter = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

  let fullText = '';
  for (let pageIndex = 1; pageIndex <= pdf.numPages; pageIndex++) {
    const page = await pdf.getPage(pageIndex);
    const textContent = await page.getTextContent();
    if (textContent.items == null) {
      // Nothing to append. Accumulating in order means a page without items
      // neither injects "undefined" nor cuts off the pages that follow.
      continue;
    }

    let pageText = '';
    let previous = null;
    for (const item of textContent.items) {
      if (previous != null && !previous.str.endsWith(' ')) {
        // NOTE(review): transform[5] / transform[4] are presumably the vertical /
        // horizontal offsets of the fragment - confirm against the pdf.js docs.
        if (item.transform[5] < previous.transform[5]) {
          pageText += '\r\n';
        } else if (item.transform[4] != previous.transform[4] && !loneTrailingLetter.test(previous.str)) {
          pageText += ' ';
        }
      }
      pageText += item.str;
      previous = item;
    }
    fullText += pageText + '\n\n';
  }
  return fullText;
}

export { pdfToText };

0 comments on commit 42abbdb

Please sign in to comment.