Skip to content

Latest commit

 

History

History

faiss

Folders and files

NameName
Last commit message
Last commit date

parent directory

..
 
 
 
 

Use in conjunction with FAISS

Faiss is a library for efficient similarity search and clustering of dense vectors, developed by Meta AI Research.

The following extended example parses arXiv metadata out of a json file arxiv-metadata-10000.json. It has 10000 lines, each of which is a json object containing metadata for a paper on arXiv. For example, the first line is (after formatting for easy reading):

{
  "id": "0704.0001",
  "submitter": "Pavel Nadolsky",
  "authors": "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  "title": "Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies",
  "comments": "37 pages, 15 figures; published version",
  "journal-ref": "Phys.Rev.D76:013009,2007",
  "doi": "10.1103/PhysRevD.76.013009",
  "report-no": "ANL-HEP-PR-07-12",
  "categories": "hep-ph",
  "license": null,
  "abstract": "  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified ...",
  "versions": [ { "version": "v1", "created": "Mon, 2 Apr 2007 19:18:42 GMT" }, { "version": "v2", "created": "Tue, 24 Jul 2007 20:10:27 GMT" } ],
  "update_date": "2008-11-26",
  "authors_parsed": [ [ "Balázs", "C.", "" ], [ "Berger", "E. L.", "" ], [ "Nadolsky", "P. M.", "" ], [ "Yuan", "C. -P.", "" ] ]}

The entire arXiv corpus contains about 1.7 million papers, available as the "arXiv Dataset" on Kaggle.

# Load the INSTRUCTOR embedding model and run a quick smoke test.
from InstructorEmbedding import INSTRUCTOR
model_ins = INSTRUCTOR('hkunlp/instructor-base')

# INSTRUCTOR expects [instruction, text] pairs; encode a single example
# title and show the resulting embedding shape.
example_title = "3D ActionSLAM: wearable person tracking in multi-floor environments"
example_instruction = "Represent the Science title:"
example_embedding = model_ins.encode([[example_instruction, example_title]])
print(example_embedding.shape)

# --------------------------------------------------------------------------------

import json
import numpy as np
import faiss
import torch

# Select GPU if available; encode() below accepts the device as a string.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device = ", device)

# FAISS index setup: exact (brute-force) L2 search over raw vectors.
dimension = 768  # hkunlp/instructor-base output dimension
index_ins = faiss.IndexFlatL2(dimension)

# Corpus configuration.
db_filename = 'arxiv-metadata-10000.json'
num_lines = 10000  # cap on how many papers to index
batch_size = 4

# Load all papers from the JSON-lines file (one JSON object per line).
# Explicit UTF-8 avoids locale-dependent decode errors on some platforms.
with open(db_filename, 'r', encoding='utf-8') as f:
    papers = [json.loads(line) for line in f]

# Concatenate each paper's title and abstract into one text to embed.
texts = [f"{paper['title']}: {paper['abstract']}" for paper in papers]

# Every text gets the same embedding instruction, paired as
# [instruction, text] — the input format INSTRUCTOR requires.
instructions = ["Represent the science titles and abstracts: "] * len(texts)
inputs = [[instr, txt] for instr, txt in zip(instructions, texts)]

# Create vectors using INSTRUCTOR (at most num_lines entries).
vectors = model_ins.encode(
    sentences=inputs[:num_lines],
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=str(device)
)

# FAISS requires contiguous float32 input.
index_ins.add(np.array(vectors).astype('float32'))

# Report the count actually indexed: the file may hold fewer than
# num_lines records, so hard-coding num_lines here could be wrong.
print(f"Added {index_ins.ntotal} papers to the FAISS index.")

# --------------------------------------------------------------------------------

def search_ins(query, k=5):
    """Return the row indices of the k nearest papers for a text query.

    The query is embedded with an instruction prefix and searched
    against the module-level FAISS index ``index_ins``.

    Args:
        query: Free-text search string.
        k: Number of nearest neighbours to return (default 5).

    Returns:
        1-D array of the k best-matching row indices into the corpus.
    """
    # BUG FIX: the original called encode([instruction, query]), which
    # embeds the instruction and the query as two *separate* sentences,
    # then searched with vector[1] — the bare query embedding with no
    # instruction applied. INSTRUCTOR requires [[instruction, text]]
    # pairs (as used when the corpus was indexed above).
    vector = model_ins.encode([["Represent the query to a science database: ", query]])
    _, indices = index_ins.search(np.array(vector).reshape(1, -1).astype('float32'), k)
    return indices[0]

# Run each query against the index and print the matching papers.
# NOTE(review): `queries` and `print_paper_details` are not defined
# anywhere in this chunk — presumably they come from an earlier cell
# of the original notebook/script; confirm before running standalone.
for query in queries:
    print(f"Question: {query}\n")
    # Retrieve the 2 nearest papers; indices are line numbers into
    # the corpus loaded above.
    line_numbers = search_ins(query, k=2)
    print_paper_details(line_numbers)
    print('-'*80)