-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
108 lines (96 loc) · 4.4 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import glob
import os
from flask import Flask, redirect, render_template, request, url_for
from flask_json import FlaskJSON, JsonError, json_response, as_json
import ast # for converting embeddings saved as strings back to arrays
import openai # for calling the OpenAI API
import pandas as pd # for storing text and embeddings data
import tiktoken # for counting tokens
from scipy import spatial # for calculating vector similarities for search
from pprint import pprint
import clickhouse_connect
import numpy as np
# Flask application; FlaskJSON adds JSON responses and JSON error handling.
app = Flask(__name__)
FlaskJSON(app)

# The OpenAI API key is read from the environment (None when unset).
openai.api_key = os.getenv("OPENAI_API_KEY")

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

# ClickHouse connection, configurable through CLICKHOUSE_* environment
# variables. Environment values are always strings, so the port is converted
# to int: the driver then receives the same type whether the variable is set
# or the default is used.
client = clickhouse_connect.get_client(
    host=os.environ.get("CLICKHOUSE_HOST", "localhost"),
    database=os.environ.get("CLICKHOUSE_DATABASE", "openai"),
    username=os.environ.get("CLICKHOUSE_USERNAME", "default"),
    password=os.environ.get("CLICKHOUSE_PASSWORD", ""),
    port=int(os.environ.get("CLICKHOUSE_PORT", 8123)),
)
def get_embedding_for(text: str, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``
        list.
    """
    single_line = text.replace("\n", " ")
    api_response = openai.Embedding.create(input=[single_line], model=model)
    first_item = api_response["data"][0]
    return first_item["embedding"]
def save_adress_embedding_to_csv():
    """Embed the address of every property file and save the result as CSV.

    Reads each file matching ``data/z*.txt``. The first line of a file is the
    address; the remaining lines are the description. An embedding is
    requested for every address through ``get_embedding_for`` and the table
    (``address``, ``description``, ``adress_embedding``) is written to
    ``address_embeddings_descriptions.csv`` in the current directory.

    When no input file is found, the function returns without requesting any
    embedding and without writing a file.
    """
    file_list = glob.glob("data/z*.txt")
    if not file_list:
        # An empty DataFrame has no "address" column, so carrying on would
        # fail with AttributeError on ``df.address``.
        pprint("No files matching data/z*.txt found; nothing to embed.")
        return

    pprint("****** The list of files *********")
    pprint(file_list)

    data_list = []
    for a_file_name in file_list:
        with open(a_file_name, "r") as f:
            # The first line is the address used for the embedding.
            address = f.readline()
            # The rest of the file is the description from the Zillow page.
            description = f.read()
        pprint("address -----------> ")
        data_list.append(
            {"address": address.strip(), "description": description.strip()}
        )

    df = pd.DataFrame.from_records(data=data_list)
    df["adress_embedding"] = df.address.apply(get_embedding_for)
    pprint(df.head(5))
    df.to_csv("address_embeddings_descriptions.csv")
def save_table_to_db(
    csv_path="addess_embeddings_descriptions.csv.gz",
    table_name="qa_properties",
):
    """Load saved address embeddings from CSV and insert them into ClickHouse.

    The ``adress_embedding`` column holds each embedding as a string. It is
    parsed with ``ast.literal_eval`` into a float32 numpy array and stored in
    a new ``address_embeddings`` column. The original string column and the
    ``Unnamed: 0`` column (presumably the index written by ``to_csv``) are
    dropped before the insert.

    Args:
        csv_path: CSV file to read. NOTE(review): the default name differs
            from the file ``save_adress_embedding_to_csv`` writes
            ("address_embeddings_descriptions.csv", uncompressed) - confirm
            which name is intended, or pass the path explicitly.
        table_name: ClickHouse table the rows are inserted into.

    Returns:
        The DataFrame that was inserted.
    """

    def parse_embedding(string_embedding):
        # literal_eval only accepts Python literals, unlike eval.
        return np.asarray(ast.literal_eval(string_embedding), dtype="float32")

    df = pd.read_csv(csv_path)
    df["address_embeddings"] = df.adress_embedding.apply(parse_embedding)
    df = df.drop(["adress_embedding", "Unnamed: 0"], axis=1)
    client.insert_df(table_name, df)
    pprint(df.head(2))
    return df
def do_qa(a_question):
    """Answer a question using the closest stored property description.

    Steps:
        1. Request an embedding for the question.
        2. Query ``qa_properties`` for the description whose address
           embedding has the smallest cosine distance to that embedding.
        3. Ask the chat model to answer the question from that description.

    Args:
        a_question: The question text.

    Returns:
        The answer text from the chat model, or "Not possible to answer" when
        the query returns no rows.
    """
    pprint(a_question)

    # 1. Get the embedding of the question.
    question_embedding = get_embedding_for(a_question)

    # 2. Find the stored description closest to the question.
    parameters = {'question_embedings': question_embedding}
    df = client.query_df(
        'SELECT min(cosineDistance(address_embeddings, '
        '{question_embedings:Array(Float32)})) as distance, description '
        'FROM qa_properties group by description order by 1 asc limit 1',
        parameters=parameters,
    )
    if df is None or df.empty:
        # Nothing to ground an answer on; reply the way the prompt tells the
        # model to reply when the answer cannot be found.
        return "Not possible to answer"

    # Take the scalar text. Formatting a DataFrame slice into the prompt
    # would embed its repr: column header, row index and truncated text.
    description = df.iloc[0]["description"]

    # 3. Ask the chat model, restricted to the retrieved description.
    query = (
        "Use the below description of real estate property to answer the "
        "subsequent question. If the answer cannot be found, write "
        '"Not possible to answer"\n'
        "Description of real estate property:\n"
        '"""\n'
        f"{description}\n"
        '"""\n'
        f"Question: {a_question}?"
    )
    response = openai.ChatCompletion.create(
        messages=[
            {'role': 'system', 'content': 'You answer questions about real estate property.'},
            {'role': 'user', 'content': query},
        ],
        model=GPT_MODEL,
        temperature=0,
    )
    answer = response['choices'][0]['message']['content']
    pprint(answer)
    return answer
@app.route("/", methods=("GET", "POST"))
def index():
    """Serve the question form and handle its submission.

    GET renders ``index.html`` with the ``result`` query parameter, if any.
    POST answers the submitted ``question`` form field with ``do_qa`` and
    redirects back to this page, passing the answer as ``result``.
    """
    if request.method != "POST":
        shown_result = request.args.get("result")
        return render_template("index.html", result=shown_result)

    submitted_question = request.form["question"]
    reply = do_qa(submitted_question)
    target = url_for("index", result=reply)
    return redirect(target)
@app.route('/property_data', methods=['POST'])
def property_data():
    """Answer a question posted as JSON and return the answer as JSON.

    Expects a JSON object with a string ``question`` field.

    Returns:
        A JSON response with an ``answer`` field.

    Raises:
        JsonError: If the payload has no ``question`` field or the field is
            not a string.
    """
    # We use 'force' to skip mimetype checking to have shorter curl command.
    data = request.get_json(force=True)

    # Only the payload lookup is guarded, so failures inside do_qa are not
    # misreported to the client as an invalid request.
    try:
        question = data['question']
    except (KeyError, TypeError) as err:
        raise JsonError(description='Invalid value.') from err
    if not isinstance(question, str):
        raise JsonError(description='Invalid value.')

    answer = do_qa(question)
    return json_response(answer=answer)