Skip to content

Commit

Permalink
fix: add serializer in doc store (#1243)
Browse files Browse the repository at this point in the history
Co-authored-by: Alex Yang <himself65@outlook.com>
  • Loading branch information
gorango and himself65 authored Sep 23, 2024
1 parent bdc4bfe commit 23bcc37
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 13 deletions.
7 changes: 7 additions & 0 deletions .changeset/gorgeous-bees-hide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"llamaindex": patch
---

fix: add `serializer` in doc store

`PostgresDocumentStore` no longer uses `JSON.stringify`, for better performance
12 changes: 9 additions & 3 deletions packages/llamaindex/src/ingestion/IngestionCache.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import type { BaseNode, TransformComponent } from "@llamaindex/core/schema";
import { MetadataMode } from "@llamaindex/core/schema";
import { createSHA256 } from "@llamaindex/env";
import { docToJson, jsonToDoc } from "../storage/docStore/utils.js";
import {
docToJson,
jsonSerializer,
jsonToDoc,
} from "../storage/docStore/utils.js";
import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js";
import type { BaseKVStore } from "../storage/kvStore/types.js";

Expand Down Expand Up @@ -53,7 +57,7 @@ export class IngestionCache {

/**
 * Stores `nodes` in the cache under `hash`.
 *
 * Each node is converted with `docToJson`, the resulting array is placed under
 * `this.nodesKey`, and the record is written through `this.cache.put` into
 * `this.collection`.
 */
async put(hash: string, nodes: BaseNode[]) {
const val = {
// NOTE(review): `[this.nodesKey]` is listed twice below — this looks like the
// diff's removed and added lines shown together; in an object literal the
// later entry is the one that takes effect. Confirm against the real file.
[this.nodesKey]: nodes.map((node) => docToJson(node)),
[this.nodesKey]: nodes.map((node) => docToJson(node, jsonSerializer)),
};
await this.cache.put(hash, val, this.collection);
}
Expand All @@ -63,6 +67,8 @@ export class IngestionCache {
if (!json || !json[this.nodesKey] || !Array.isArray(json[this.nodesKey])) {
return undefined;
}
return json[this.nodesKey].map((doc: any) => jsonToDoc(doc));
return json[this.nodesKey].map((doc: any) =>
jsonToDoc(doc, jsonSerializer),
);
}
}
6 changes: 3 additions & 3 deletions packages/llamaindex/src/storage/docStore/KVDocumentStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ export class KVDocumentStore extends BaseDocumentStore {
for (const key in jsonDict) {
const value = jsonDict[key];
if (isValidDocJson(value)) {
docs[key] = jsonToDoc(value);
docs[key] = jsonToDoc(value, this.serializer);
} else {
console.warn(`Invalid JSON for docId ${key}`);
}
Expand All @@ -52,7 +52,7 @@ export class KVDocumentStore extends BaseDocumentStore {
);
}
const nodeKey = doc.id_;
const data = docToJson(doc);
const data = docToJson(doc, this.serializer);
await this.kvstore.put(nodeKey, data, this.nodeCollection);
const metadata: DocMetaData = { docHash: doc.hash };

Expand Down Expand Up @@ -94,7 +94,7 @@ export class KVDocumentStore extends BaseDocumentStore {
if (!isValidDocJson(json)) {
throw new Error(`Invalid JSON for docId ${docId}`);
}
return jsonToDoc(json);
return jsonToDoc(json, this.serializer);
}

async getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
type PostgresKVStoreConfig,
} from "../kvStore/PostgresKVStore.js";
import { KVDocumentStore } from "./KVDocumentStore.js";
import { noneSerializer } from "./utils.js";

const DEFAULT_TABLE_NAME = "llamaindex_doc_store";

Expand All @@ -12,6 +13,8 @@ export type PostgresDocumentStoreConfig = PostgresKVStoreConfig & {
};

export class PostgresDocumentStore extends KVDocumentStore {
serializer = noneSerializer;

constructor(config?: PostgresDocumentStoreConfig) {
const kvStore = new PostgresKVStore({
schemaName: config?.schemaName,
Expand Down
3 changes: 3 additions & 0 deletions packages/llamaindex/src/storage/docStore/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
DEFAULT_PERSIST_DIR,
} from "@llamaindex/core/global";
import { BaseNode } from "@llamaindex/core/schema";
import { jsonSerializer, type Serializer } from "./utils.js";

const defaultPersistPath = `${DEFAULT_PERSIST_DIR}/${DEFAULT_DOC_STORE_PERSIST_FILENAME}`;

Expand All @@ -12,6 +13,8 @@ export interface RefDocInfo {
}

export abstract class BaseDocumentStore {
serializer: Serializer<any> = jsonSerializer;

// Save/load
persist(persistPath: string = defaultPersistPath): void {
// Persist the docstore to a file.
Expand Down
43 changes: 36 additions & 7 deletions packages/llamaindex/src/storage/docStore/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,35 @@ import { Document, ObjectType, TextNode } from "@llamaindex/core/schema";
const TYPE_KEY = "__type__";
const DATA_KEY = "__data__";

type DocJson = {
/**
 * Converts a node's plain-object data to and from the representation `T`
 * that is actually written to storage.
 *
 * @typeParam T - The persisted representation (for example a JSON string, or
 * the record itself when no conversion is wanted).
 */
export interface Serializer<T> {
/** Turns the plain-object data into its persisted form. */
toPersistence(data: Record<string, unknown>): T;
/** Turns a persisted value back into the plain-object data. */
fromPersistence(data: T): Record<string, unknown>;
}

/**
 * Serializer that persists node data as a JSON string.
 *
 * `toPersistence` encodes the record with `JSON.stringify`; `fromPersistence`
 * decodes it with `JSON.parse` and verifies that the decoded value really is a
 * plain JSON object before handing it back.
 */
export const jsonSerializer: Serializer<string> = {
  /**
   * @param data - Plain-object node data.
   * @returns The JSON text for `data`.
   */
  toPersistence(data: Record<string, unknown>): string {
    return JSON.stringify(data);
  },
  /**
   * @param data - JSON text previously produced by `toPersistence`.
   * @returns The decoded record.
   * @throws SyntaxError if `data` is not valid JSON (raised by `JSON.parse`).
   * @throws TypeError if the JSON decodes to anything other than an object
   * (`null`, an array, or a primitive).
   */
  fromPersistence(data: string): Record<string, unknown> {
    const parsed: unknown = JSON.parse(data);
    // JSON.parse happily returns null, arrays and primitives; none of those is
    // a record, so fail here with a clear message instead of letting a
    // mistyped value travel further.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      const kind =
        parsed === null
          ? "null"
          : Array.isArray(parsed)
            ? "array"
            : typeof parsed;
      throw new TypeError(
        `Expected serialized node data to be a JSON object, got ${kind}`,
      );
    }
    return parsed as Record<string, unknown>;
  },
};

/**
 * Pass-through serializer: the record is persisted as-is and read back as-is,
 * with no copying or encoding in either direction.
 */
export const noneSerializer: Serializer<Record<string, unknown>> = {
  // Hands back the very same object it was given.
  toPersistence: (data: Record<string, unknown>) => data,
  // Likewise returns the stored object untouched.
  fromPersistence: (data: Record<string, unknown>) => data,
};

/**
 * Persisted envelope for a node: its `ObjectType` under `TYPE_KEY` and its
 * payload under `DATA_KEY`.
 */
type DocJson<Data> = {
[TYPE_KEY]: ObjectType;
// NOTE(review): `[DATA_KEY]` is declared twice below — this looks like the
// diff's removed (`string`) and added (`Data`) lines shown together; confirm
// which one the real file contains.
[DATA_KEY]: string;
[DATA_KEY]: Data;
};

export function isValidDocJson(docJson: any): docJson is DocJson {
export function isValidDocJson(docJson: any): docJson is DocJson<unknown> {
return (
typeof docJson === "object" &&
docJson !== null &&
Expand All @@ -18,16 +41,22 @@ export function isValidDocJson(docJson: any): docJson is DocJson {
);
}

export function docToJson(doc: BaseNode): DocJson {
/**
 * Wraps a node in the persisted `DocJson` envelope.
 *
 * @param doc - Node to convert; its `toJSON()` output is the payload and its
 * `type` is the tag.
 * @param serializer - Converts the `toJSON()` output into the form stored
 * under `DATA_KEY`.
 * @returns An object with the payload under `DATA_KEY` and `doc.type` under
 * `TYPE_KEY`.
 */
export function docToJson(
doc: BaseNode,
serializer: Serializer<unknown>,
): DocJson<unknown> {
return {
// NOTE(review): `[DATA_KEY]` is listed twice below — this looks like the
// diff's removed and added lines shown together; in an object literal the
// later entry is the one that takes effect. Confirm against the real file.
[DATA_KEY]: JSON.stringify(doc.toJSON()),
[DATA_KEY]: serializer.toPersistence(doc.toJSON()),
[TYPE_KEY]: doc.type,
};
}

export function jsonToDoc(docDict: DocJson): BaseNode {
export function jsonToDoc<Data>(
docDict: DocJson<Data>,
serializer: Serializer<Data>,
): BaseNode {
const docType = docDict[TYPE_KEY];
const dataDict = JSON.parse(docDict[DATA_KEY]);
const dataDict = serializer.fromPersistence(docDict[DATA_KEY]) as any;
let doc: BaseNode;

if (docType === ObjectType.DOCUMENT) {
Expand Down

0 comments on commit 23bcc37

Please sign in to comment.