Description
When trying to insert data into my BigQuery tables, it takes forever. In the following code I insert one query log entry with 25 query result documents referring to it; it takes around 70 seconds when done through SQLAlchemy, but less than a second when I insert the same rows via the BigQuery client:
# %%
import uuid
from sqlalchemy import create_engine
from sqlalchemy import Column, ForeignKey, String, TIMESTAMP
from sqlalchemy.ext.declarative import declarative_base
import time
from sqlalchemy.orm import Session
import datetime
# Create engine
path = "path to my big query dataset"
engine = create_engine(
    path,
    pool_size=100000,
    max_overflow=100000,
    query_cache_size=100000,
    connect_args={},
    execution_options={
        "fast_executemany": True,
        "fast_executemany_batch_size": 100000,
        "insertmanyvalues_page_size": 100000,  # Optimize batch size
        "enable_parallel_execution": True,  # Enable parallel execution where possible
    },
    echo=True,
)
# %%
# Define the table models
QUERY_TABLE_NAME = "query_log_entries"
QUERY_RESULT_DOCUMENT_TABLE_NAME = "query_result_documents"
Base = declarative_base()
class QueryLog(Base):
    __tablename__ = QUERY_TABLE_NAME
    query_id = Column(String, primary_key=True)
    timestamp = Column(TIMESTAMP)
    git_hash = Column(String)


class QueryResultDocument(Base):
    __tablename__ = QUERY_RESULT_DOCUMENT_TABLE_NAME
    query_id_ref = Column(String, ForeignKey(f"{QUERY_TABLE_NAME}.query_id"))
    id = Column(String, primary_key=True)
    document_id = Column(String)
# %%
id = str(uuid.uuid4())
# Create a sample query log entry
sample_query_log = QueryLog(
    query_id=id,
    timestamp=datetime.datetime.now(),
    git_hash="abc123",
)
# %%
docs = []
for i in range(25):
    sample_query_result_doc = QueryResultDocument(
        query_id_ref=id,  # Matches the query_id from sample_query_log
        id=str(uuid.uuid4()),
        document_id="policy_doc_456",
    )
    docs.append(sample_query_result_doc)
# Create all tables if they don't exist
Base.metadata.create_all(engine)
# %%
# Adding the data to the tables via SQLAlchemy
start_time = time.time()
with Session(engine) as session:
    session.add(sample_query_log)
    session.bulk_save_objects(docs)
    session.commit()
end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")
# %%
from google.cloud import bigquery
# Initialize BigQuery client
client = bigquery.Client(project='project')
# Adding the data to the tables via the BigQuery client
start_time = time.time()
client.insert_rows_json(f".{QUERY_TABLE_NAME}", [{
    'query_id': sample_query_log.query_id,
    'timestamp': sample_query_log.timestamp.isoformat(),
    'git_hash': sample_query_log.git_hash
}])
client.insert_rows_json(f"project.{QUERY_RESULT_DOCUMENT_TABLE_NAME}", [{
    'query_id_ref': doc.query_id_ref,
    'id': doc.id,
    'document_id': doc.document_id
} for doc in docs])
end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")
# %%
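# Sketch (an extra data point, not from the original report): the BigQuery
# client also offers a batch-load path, client.load_table_from_json(), which
# runs a load job instead of the streaming insert used above. The destination
# below reuses the same redacted "project.<table>" ID as the snippet above.
start_time = time.time()
load_job = client.load_table_from_json(
    [
        {"query_id_ref": doc.query_id_ref, "id": doc.id, "document_id": doc.document_id}
        for doc in docs
    ],
    f"project.{QUERY_RESULT_DOCUMENT_TABLE_NAME}",
)
load_job.result()  # Wait for the load job to finish
end_time = time.time()
print(f"Load job time: {end_time - start_time:.2f} seconds")
# %%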
Does anyone know the reason for such a huge performance difference?