diff --git a/eland/operations.py b/eland/operations.py
index e6da9742a..2b71c07d5 100644
--- a/eland/operations.py
+++ b/eland/operations.py
@@ -1279,14 +1279,14 @@ def search_yield_pandas_dataframes(
         if sort_params:
             body["sort"] = [sort_params]
 
-        # i = 1
+        i = 1
         for hits in _search_yield_hits(
             query_compiler=query_compiler, body=body, max_number_of_hits=result_size, sort_index=sort_index
         ):
             df = query_compiler._es_results_to_pandas(hits)
             df = self._apply_df_post_processing(df, post_processing)
-            # df.to_csv(f'debug_{i}.csv')
-            # i += 1
+            df.to_csv(f'debug_{i}.csv')
+            i += 1
             yield df
 
     def index_count(self, query_compiler: "QueryCompiler", field: str) -> int:
@@ -1566,31 +1566,31 @@ def _search_yield_hits(
         # to be the last sort value for this set of hits.
         body["search_after"] = hits[-1]["sort"]
 
-# if __name__ == "__main__":
-#     import eland as ed
-#     from opensearchpy import OpenSearch
-#
-#
-#     # try connecting to an actual cluster at some point
-#     def get_os_client(cluster_url='https://localhost:9200',
-#                       username='admin',
-#                       password='admin'):
-#         '''
-#         Get OpenSearch client
-#         :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
-#         :return: OpenSearch client
-#         '''
-#         client = OpenSearch(
-#             hosts=[cluster_url],
-#             http_auth=(username, password),
-#             verify_certs=False
-#         )
-#         return client
-#
-#     client = get_os_client()
-#     ed_df = ed.DataFrame(client, 'sagemaker_demo_data')
-#
-#     indices = [index for index, _ in ed_df.iterrows('_doc')]
-#     print(len(set(indices)))
-#
-#     pass
\ No newline at end of file
+if __name__ == "__main__":
+    import eland as ed
+    from opensearchpy import OpenSearch
+
+
+    # try connecting to an actual cluster at some point
+    def get_os_client(cluster_url='https://localhost:9200',
+                      username='admin',
+                      password='admin'):
+        '''
+        Get OpenSearch client
+        :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
+        :return: OpenSearch client
+        '''
+        client = OpenSearch(
+            hosts=[cluster_url],
+            http_auth=(username, password),
+            verify_certs=False
+        )
+        return client
+
+    client = get_os_client()
+    ed_df = ed.DataFrame(client, 'sagemaker_demo_data')
+
+    indices = [index for index, _ in ed_df.iterrows('_doc')]
+    print(len(set(indices)))
+
+    pass
\ No newline at end of file