Skip to content

Commit

Permalink
WIP - parses pdf and populates the database
Browse files Browse the repository at this point in the history
  • Loading branch information
roman-r-m committed Jan 27, 2021
1 parent 546373c commit f92b7c6
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 19 deletions.
8 changes: 4 additions & 4 deletions api/JoplinCommands.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import { Command } from './types';
* and look at the `execute()` command.
*/
export default class JoplinCommands {
/**
/**
* <span class="platform-desktop">desktop</span> Executes the given
* command.
*
Expand All @@ -40,8 +40,8 @@ export default class JoplinCommands {
* await joplin.commands.execute('newFolder', "SOME_FOLDER_ID");
* ```
*/
execute(commandName: string, ...args: any[]): Promise<any | void>;
/**
execute(commandName: string, ...args: any[]): Promise<any | void>;
/**
* <span class="platform-desktop">desktop</span> Registers a new command.
*
* ```typescript
Expand All @@ -57,5 +57,5 @@ export default class JoplinCommands {
* });
* ```
*/
register(command: Command): Promise<void>;
register(command: Command): Promise<void>;
}
4 changes: 2 additions & 2 deletions api/JoplinFilters.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
* so for now disable filters.
*/
export default class JoplinFilters {
on(name: string, callback: Function): Promise<void>;
off(name: string, callback: Function): Promise<void>;
on(name: string, callback: Function): Promise<void>;
off(name: string, callback: Function): Promise<void>;
}
4 changes: 2 additions & 2 deletions api/JoplinInterop.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ import { ExportModule, ImportModule } from './types';
* You may also want to refer to the Joplin API documentation to see the list of properties for each item (note, notebook, etc.) - https://joplinapp.org/api/references/rest_api/
*/
export default class JoplinInterop {
registerExportModule(module: ExportModule): Promise<void>;
registerImportModule(module: ImportModule): Promise<void>;
registerExportModule(module: ExportModule): Promise<void>;
registerImportModule(module: ImportModule): Promise<void>;
}
17 changes: 17 additions & 0 deletions api/JoplinPlugins.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,21 @@ export default class JoplinPlugins {
* @deprecated Use joplin.contentScripts.register()
*/
registerContentScript(type: ContentScriptType, id: string, scriptPath: string): Promise<void>;
/**
* Gets the plugin's own data directory path. Use this to store any plugin-related data.
*/
dataDir(): Promise<string>;
/**
* It is not possible to bundle native packages with a plugin, because they
* need to work cross-platform. Instead, access to certain useful native
* packages is provided using this function.
*
* Currently these packages are available:
*
* - [sqlite3](https://www.npmjs.com/package/sqlite3)
* - [fs-extra](https://www.npmjs.com/package/fs-extra)
*
* [View the demo plugin](https://github.com/laurent22/joplin/tree/dev/packages/app-cli/tests/support/plugins/nativeModule)
*/
require(_path: string): any;
}
2 changes: 1 addition & 1 deletion api/JoplinSettings.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export interface ChangeEvent {
*/
keys: string[];
}
export declare type ChangeHandler = (event: ChangeEvent)=> void;
export declare type ChangeHandler = (event: ChangeEvent) => void;
/**
* This API allows registering new settings and setting sections, as well as getting and setting settings. Once a setting has been registered it will appear in the config screen and be editable by the user.
*
Expand Down
6 changes: 3 additions & 3 deletions api/JoplinWorkspace.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { Disposable } from './types';
declare enum ItemChangeEventType {
Create = 1,
Update = 2,
Delete = 3,
Delete = 3
}
interface ItemChangeEvent {
id: string;
Expand All @@ -12,8 +12,8 @@ interface ItemChangeEvent {
interface SyncStartEvent {
withErrors: boolean;
}
declare type ItemChangeHandler = (event: ItemChangeEvent)=> void;
declare type SyncStartHandler = (event: SyncStartEvent)=> void;
declare type ItemChangeHandler = (event: ItemChangeEvent) => void;
declare type SyncStartHandler = (event: SyncStartEvent) => void;
/**
* The workspace service provides access to all the parts of Joplin that
* are being worked on - i.e. the currently selected notes or notebooks as
Expand Down
49 changes: 45 additions & 4 deletions api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -330,16 +330,57 @@ export enum SettingItemType {
export interface SettingItem {
value: any;
type: SettingItemType;
public: boolean;
label: string;

label: string;
description?: string;
isEnum?: boolean;

/**
* A public setting will appear in the Configuration screen and will be
* modifiable by the user. A private setting however will not appear there,
* and can only be changed programmatically. You may use this to store some
* values that you do not want to directly expose.
*/
public: boolean;

/**
* You would usually set this to a section you would have created
* specifically for the plugin.
*/
section?: string;
options?: any;

/**
* To create a setting with multiple options, set this property to `true`.
* That setting will render as a dropdown list in the configuration screen.
*/
isEnum?: boolean;

/**
* This property is required when `isEnum` is `true`. In which case, it
* should contain a map of value => label.
*/
options?: Record<any, any>;

/**
* Reserved property. Not used at the moment.
*/
appTypes?: string[];

/**
* Set this to `true` to store secure data, such as passwords. Any such
* setting will be stored in the system keychain if one is available.
*/
secure?: boolean;

/**
* An advanced setting will be moved under the "Advanced" button in the
* config screen.
*/
advanced?: boolean;

/**
* Set the min, max and step values if you want to restrict an int setting
* to a particular range.
*/
minimum?: number;
maximum?: number;
step?: number;
Expand Down
9 changes: 7 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,30 @@
"version": "1.0.0",
"scripts": {
"dist": "webpack --joplin-plugin-config buildMain && webpack --joplin-plugin-config buildExtraScripts && webpack --joplin-plugin-config createArchive",
"prepare": "npm run dist",
"update": "npm install -g generator-joplin && yo joplin --update"
"prepare": "npm run dist"
},
"license": "MIT",
"keywords": [
"joplin-plugin"
],
"devDependencies": {
"@types/node": "^14.0.14",
"canvas": "^2.6.1",
"chalk": "^4.1.0",
"copy-webpack-plugin": "^6.1.0",
"fs-extra": "^9.0.1",
"glob": "^7.1.6",
"node-loader": "^1.0.2",
"on-build-webpack": "^0.1.0",
"tar": "^6.0.5",
"ts-loader": "^7.0.5",
"typescript": "^3.9.3",
"webpack": "^4.43.0",
"webpack-cli": "^3.3.11",
"yargs": "^16.2.0"
},
"dependencies": {
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^2.6.347"
}
}
112 changes: 111 additions & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,117 @@
import joplin from 'api';
import JoplinData from 'api/JoplinData';

/**
 * Extracts the plain text of the PDF file at `path`.
 *
 * Adapts the callback-style `pdfToText()` to a promise: the returned promise
 * resolves with the text handed to the completion callback.
 *
 * @param path Path of the PDF file to read.
 * @returns The text extracted from the document.
 * @throws Rejects with whatever error makes `pdfToText()` reject.
 */
async function extractText(path: string): Promise<string> {
	return new Promise<string>((resolve, reject) => {
		// pdfToText() is an async function, so a failure surfaces as a rejected
		// promise. Forward it; otherwise this promise would stay pending
		// forever and the rejection would go unhandled.
		pdfToText(path, resolve).catch(reject);
	});
}

/** Minimal shape of a pdf.js text item, limited to the fields read here. */
interface PdfTextItem {
	str: string;
	transform: number[];
}

/**
 * Joins the text items of one page into a single string, inserting a line
 * break or a space between consecutive items based on their positions.
 */
function joinPdfTextItems(items: PdfTextItem[]): string {
	// Previous item is a lone letter, or ends with whitespace + a single letter:
	// in that case no space is inserted before the next item.
	const endsWithSingleLetter = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

	let pageText = "";
	let previous: PdfTextItem | null = null;
	for (const item of items) {
		// Separators are only considered when the previous item does not
		// already end with a space.
		if (previous != null && !previous.str.endsWith(' ')) {
			// transform is the [a, b, c, d, e, f] matrix: index 4 is the
			// horizontal offset, index 5 the vertical one.
			const itemX = item.transform[4];
			const previousX = previous.transform[4];
			const itemY = item.transform[5];
			const previousY = previous.transform[5];
			if (itemY < previousY) {
				// The vertical offset decreased: the item starts a new line.
				pageText += "\r\n";
			} else if (itemX != previousX && !endsWithSingleLetter.test(previous.str)) {
				pageText += ' ';
			}
		}

		pageText += item.str;
		previous = item;
	}
	return pageText;
}

/**
 * Reads the PDF at `path` with pdf.js and passes the text of all its pages,
 * concatenated in page order, to `callbackAllDone`.
 *
 * All pages are requested concurrently and awaited together, so the callback
 * runs exactly once, after every page has been processed. A page without text
 * items contributes nothing, and a document without pages yields an empty
 * string.
 *
 * @param path Path (or URL) of the PDF document, handed to `PDFJS.getDocument`.
 * @param callbackAllDone Called once with the full text of the document.
 * @returns A promise that settles after the callback has been invoked.
 * @throws Rejects if pdf.js cannot be loaded or the document/pages cannot be read.
 */
async function pdfToText(path: string, callbackAllDone: (text: string) => void): Promise<void> {
	const PDFJS = require('pdfjs-dist/es5/build/pdf.js');
	const pdfjsWorker = await import('pdfjs-dist/build/pdf.worker.entry');
	PDFJS.GlobalWorkerOptions.workerSrc = pdfjsWorker;

	const pdf = await PDFJS.getDocument(path).promise;

	// Promise.all keeps results in request order, so no re-ordering step is
	// needed even though pages may finish out of order.
	const pageTexts: string[] = await Promise.all(
		Array.from({ length: pdf.numPages }, async (_, index) => {
			const page = await pdf.getPage(index + 1);
			const textContent = await page.getTextContent();
			if (textContent.items == null) return "";

			const pageText = joinPdfTextItems(textContent.items);
			console.log("page " + page.pageNumber + " finished.");
			return pageText + "\n\n";
		})
	);

	callbackAllDone(pageTexts.join(""));
}

/**
 * Opens (creating it if needed) the `resource.sqlite` database inside `path`
 * and recreates the `resources_fts` full-text table from scratch.
 *
 * @param path Directory in which the database file lives.
 * @returns The opened sqlite3 `Database` object.
 * @throws Rejects if either SQL statement fails.
 */
async function initDb(path: string) {
	console.log('init db');
	const sqlite3 = joplin.plugins.require('sqlite3');
	console.log('imported module: ' + typeof sqlite3);
	const db = new sqlite3.Database(`${path}/resource.sqlite`);
	console.log('created db: ' + typeof db);

	// sqlite3's run() reports completion through a callback and does not
	// return a promise, so awaiting it directly neither waits for the
	// statement nor surfaces its error. Wrap it so both happen.
	const run = (sql: string) => new Promise<void>((resolve, reject) => {
		db.run(sql, (error: Error | null) => (error ? reject(error) : resolve()));
	});

	// IF EXISTS: on the very first start there is no table to drop yet.
	await run('DROP TABLE IF EXISTS resources_fts');
	await run('CREATE VIRTUAL TABLE IF NOT EXISTS resources_fts USING fts5(id, title, text)');
	return db;
}

/**
 * Fetches every resource from the Joplin data API and indexes them one by one.
 *
 * The data API returns results in pages, so pages are requested until the
 * response no longer reports `has_more`. Resources are processed sequentially
 * and awaited, so the returned promise settles only when indexing is finished.
 * A resource that fails to index is logged and skipped so that one bad file
 * does not stop the rest.
 *
 * @param api Joplin data API used to list the resources.
 * @param resourceDir Directory containing the resource files.
 * @param db Database handle passed through to `indexResource()`.
 */
async function indexResources(api: JoplinData, resourceDir: string, db: any) {
	let page = 1;
	let hasMore = true;
	while (hasMore) {
		const response = await api.get(['resources'], { fields: ['id', 'title', 'mime'], page });
		const resources: any[] = response.items ?? [];

		for (const resource of resources) {
			try {
				await indexResource(resource, resourceDir, db);
			} catch (error) {
				console.error(`failed to index resource ${resource.id}`, error);
			}
		}

		hasMore = Boolean(response.has_more);
		page++;
	}
}

/**
 * Indexes a single resource: for a PDF, extracts its text and stores it in the
 * `resources_fts` table. Resources with any other MIME type are ignored.
 *
 * @param resource Resource record; `id`, `title` and `mime` are read.
 * @param resourceDir Directory containing the resource files; the PDF is
 *   looked up as `<resourceDir>/<id>.pdf`.
 * @param db sqlite3 `Database` object to insert into.
 * @throws Rejects if text extraction or the insert fails.
 */
async function indexResource(resource: any, resourceDir: string, db: any) {
	console.log(`index ${JSON.stringify(resource)}`);
	if (resource.mime !== 'application/pdf') return;

	const text = await extractText(`${resourceDir}/${resource.id}.pdf`);
	console.log(`extracted text from ${resource.title}: ${text.substring(0, 100)}`);

	// sqlite3's run() is callback-based; wrap it so the insert is actually
	// awaited and a failure rejects instead of being lost.
	await new Promise<void>((resolve, reject) => {
		db.run(
			'INSERT INTO resources_fts VALUES(?, ?, ?)',
			[resource.id, resource.title, text],
			(error: Error | null) => (error ? reject(error) : resolve())
		);
	});
}

joplin.plugins.register({
	/**
	 * Plugin entry point: reads the profile and resource directories from the
	 * global settings, (re)creates the index database inside the profile
	 * directory, then indexes all resources into it.
	 */
	onStart: async function() {
		console.info('Plugin started!');
		const profileDir = await joplin.settings.globalValue('profileDir');
		const resourceDir = await joplin.settings.globalValue('resourceDir');
		console.log(`the profile is in ${profileDir}`);

		const db = await initDb(profileDir);

		await indexResources(joplin.data, resourceDir, db);
	},
});
8 changes: 8 additions & 0 deletions webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ const baseConfig = {
use: 'ts-loader',
exclude: /node_modules/,
},
{
test: /\.node$/,
loader: 'node-loader',
},
],
},
};
Expand Down Expand Up @@ -178,6 +182,10 @@ const createArchiveConfig = {
filename: 'index.js',
path: publishDir,
},
node: {
fs: "empty",
canvas: "empty"
},
plugins: [new WebpackOnBuildPlugin(onBuildCompleted)],
};

Expand Down

0 comments on commit f92b7c6

Please sign in to comment.