
Very long time taken inserting data #1161

Open
@pLazy

Description


When inserting data into my BigQuery tables through SQLAlchemy, it takes forever. In the following code I insert one query log entry with 25 query result documents referring to it: the insert takes around 70 seconds through SQLAlchemy, but less than a second via the BigQuery client:

        
        # %%
        import datetime
        import time
        import uuid

        from sqlalchemy import Column, ForeignKey, String, TIMESTAMP, create_engine
        from sqlalchemy.orm import Session, declarative_base  # moved out of sqlalchemy.ext.declarative in 1.4
        
        # Create engine (connection string redacted; the BigQuery dialect expects "bigquery://<project>/<dataset>")
        path = "path to my big query dataset"
        engine = create_engine(
            path,
            pool_size=100000,
            max_overflow=100000,
            query_cache_size=100000,
            connect_args={},
            execution_options={
                "fast_executemany": True,
                "fast_executemany_batch_size": 100000,
                "insertmanyvalues_page_size": 100000,  # Optimize batch size
                "enable_parallel_execution": True,  # Enable parallel execution where possible
            },
            echo=True,
        )
        
        # %%
        # Define the table models (the tables themselves are created later via create_all)
        QUERY_TABLE_NAME = "query_log_entries"
        QUERY_RESULT_DOCUMENT_TABLE_NAME = "query_result_documents"
        
        Base = declarative_base()
        
        
        
        class QueryLog(Base):
            __tablename__ = QUERY_TABLE_NAME
        
            query_id = Column(String, primary_key=True)
            timestamp = Column(TIMESTAMP)
            git_hash = Column(String)
        
        
        
        class QueryResultDocument(Base):
            __tablename__ = QUERY_RESULT_DOCUMENT_TABLE_NAME
        
            query_id_ref = Column(String, ForeignKey(f"{QUERY_TABLE_NAME}.query_id"))
            id = Column(String, primary_key=True)
            document_id = Column(String)
        
        
        # %%
        
        query_id = str(uuid.uuid4())  # avoid shadowing the builtin id()
        # Create a sample query log entry
        sample_query_log = QueryLog(
            query_id=query_id,
            timestamp=datetime.datetime.now(),
            git_hash="abc123",
        )
        
        # %%
        docs = []
        for i in range(25):
            sample_query_result_doc = QueryResultDocument(
                query_id_ref=query_id,  # references the query_id from sample_query_log
                id=str(uuid.uuid4()),
                document_id="policy_doc_456",
            )
            docs.append(sample_query_result_doc)
        
        
        # Create all tables if they don't exist
        Base.metadata.create_all(engine)
        
        # %%

        # Insert the rows via SQLAlchemy
        start_time = time.time()
        with Session(engine) as session:
            session.add(sample_query_log)
            session.bulk_save_objects(docs)
            session.commit()
        
        
        end_time = time.time()
        print(f"Time taken: {end_time - start_time:.2f} seconds")
        
        # %%
        from google.cloud import bigquery
        
        # Initialize BigQuery client
        client = bigquery.Client(project='project')
        
        # Insert the same rows via the BigQuery client (streaming API)
        start_time = time.time()
        client.insert_rows_json(f".{QUERY_TABLE_NAME}", [{
            'query_id': sample_query_log.query_id,
            'timestamp': sample_query_log.timestamp.isoformat(),
            'git_hash': sample_query_log.git_hash
        }])
        
        
        client.insert_rows_json(f"project.{QUERY_RESULT_DOCUMENT_TABLE_NAME}", [{
            'query_id_ref': doc.query_id_ref,
            'id': doc.id,
            'document_id': doc.document_id
        } for doc in docs])
        
        end_time = time.time()
        print(f"Time taken: {end_time - start_time:.2f} seconds")
        
        # %%

Does anyone know the reason for such a huge performance difference?
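
For reference, here is a minimal sketch (not part of the timings above, and untested) that sends all 25 document rows through SQLAlchemy Core as a single executemany-style INSERT instead of the ORM unit of work. It assumes the slowdown comes from each row being issued as its own BigQuery DML query job, each carrying per-job overhead, while insert_rows_json goes through the low-latency streaming API. It reuses the QueryResultDocument model and the docs list defined above.

        # %%
        from sqlalchemy import insert

        rows = [
            {"query_id_ref": doc.query_id_ref, "id": doc.id, "document_id": doc.document_id}
            for doc in docs
        ]

        start_time = time.time()
        with Session(engine) as session:
            # Passing a list of dicts makes SQLAlchemy take the executemany path,
            # letting the dialect emit one multi-row statement instead of 25.
            session.execute(insert(QueryResultDocument), rows)
            session.commit()
        print(f"Time taken: {time.time() - start_time:.2f} seconds")

If the dialect compiles this into a single multi-row INSERT, it should cut the job count from 25 to 1.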
