 import functools
 import time
-from multiprocessing import get_context
+from multiprocessing import Process, Queue
 from typing import Iterable, List, Optional, Tuple
+from itertools import islice
 
 import numpy as np
 import tqdm
@@ -83,53 +84,118 @@ def search_all(
 
         # Handle num_queries parameter
         if num_queries > 0:
-            # If we need more queries than available, cycle through the list
+            # If we need more queries than available, use a cycling generator
             if num_queries > len(queries_list) and len(queries_list) > 0:
                 print(f"Requested {num_queries} queries but only {len(queries_list)} are available.")
-                print(f"Extending queries by cycling through the available ones.")
-                # Calculate how many complete cycles and remaining items we need
-                complete_cycles = num_queries // len(queries_list)
-                remaining = num_queries % len(queries_list)
-
-                # Create the extended list
-                extended_queries = []
-                for _ in range(complete_cycles):
-                    extended_queries.extend(queries_list)
-                extended_queries.extend(queries_list[:remaining])
-
-                used_queries = extended_queries
+                print(f"Using a cycling generator to efficiently process queries.")
+
+                # Create a cycling generator function
+                def cycling_query_generator(queries, total_count):
+                    """Generate queries by cycling through the available ones."""
+                    count = 0
+                    while count < total_count:
+                        for query in queries:
+                            if count < total_count:
+                                yield query
+                                count += 1
+                            else:
+                                break
+
+                # Use the generator instead of creating a full list
+                used_queries = cycling_query_generator(queries_list, num_queries)
+                # We need to know the total count for the progress bar
+                total_query_count = num_queries
             else:
                 used_queries = queries_list[:num_queries]
+                total_query_count = len(used_queries)
             print(f"Using {num_queries} queries")
         else:
             used_queries = queries_list
+            total_query_count = len(used_queries)
 
         if parallel == 1:
+            # Single-threaded execution
             start = time.perf_counter()
-            precisions, latencies = list(
-                zip(*[search_one(query) for query in tqdm.tqdm(used_queries)])
-            )
+
+            # Create a progress bar with the correct total
+            pbar = tqdm.tqdm(total=total_query_count, desc="Processing queries", unit="queries")
+
+            # Process queries with progress updates
+            results = []
+            for query in used_queries:
+                results.append(search_one(query))
+                pbar.update(1)
+
+            # Close the progress bar
+            pbar.close()
+
+            total_time = time.perf_counter() - start
         else:
-            ctx = get_context(self.get_mp_start_method())
+            # Dynamically calculate chunk size based on total_query_count
+            chunk_size = max(1, total_query_count // parallel)
+
+            # If used_queries is a generator, we need to handle it differently
+            if hasattr(used_queries, '__next__'):
+                # For generators, we'll create chunks on-the-fly
+                query_chunks = []
+                remaining = total_query_count
+                while remaining > 0:
+                    current_chunk_size = min(chunk_size, remaining)
+                    chunk = [next(used_queries) for _ in range(current_chunk_size)]
+                    query_chunks.append(chunk)
+                    remaining -= current_chunk_size
+            else:
+                # For lists, we can use the chunked_iterable function
+                query_chunks = list(chunked_iterable(used_queries, chunk_size))
 
-            with ctx.Pool(
-                processes=parallel,
-                initializer=self.__class__.init_client,
-                initargs=(
+            # Function to be executed by each worker process
+            def worker_function(chunk, result_queue):
+                self.__class__.init_client(
                     self.host,
                     distance,
                     self.connection_params,
                     self.search_params,
-                ),
-            ) as pool:
-                if parallel > 10:
-                    time.sleep(15) # Wait for all processes to start
-                start = time.perf_counter()
-                precisions, latencies = list(
-                    zip(*pool.imap_unordered(search_one, iterable=tqdm.tqdm(used_queries)))
                 )
+                self.setup_search()
+                results = process_chunk(chunk, search_one)
+                result_queue.put(results)
+
+            # Create a queue to collect results
+            result_queue = Queue()
+
+            # Create and start worker processes
+            processes = []
+            for chunk in query_chunks:
+                process = Process(target=worker_function, args=(chunk, result_queue))
+                processes.append(process)
+                process.start()
+
+            # Start measuring time for the critical work
+            start = time.perf_counter()
+
+            # Create a progress bar for the total number of queries
+            pbar = tqdm.tqdm(total=total_query_count, desc="Processing queries", unit="queries")
 
-        total_time = time.perf_counter() - start
+            # Collect results from all worker processes
+            results = []
+            for _ in processes:
+                chunk_results = result_queue.get()
+                results.extend(chunk_results)
+                # Update the progress bar with the number of processed queries in this chunk
+                pbar.update(len(chunk_results))
+
+            # Close the progress bar
+            pbar.close()
+
+            # Wait for all worker processes to finish
+            for process in processes:
+                process.join()
+
+            # Stop measuring time for the critical work
+            total_time = time.perf_counter() - start
+
+            # Extract precisions and latencies (outside the timed section)
+            precisions, latencies = zip(*results)
 
         self.__class__.delete_client()
 
@@ -157,3 +223,21 @@ def post_search(self):
     @classmethod
     def delete_client(cls):
         pass
+
+
+def chunked_iterable(iterable, size):
+    """Yield successive chunks of a given size from an iterable."""
+    it = iter(iterable)
+    while chunk := list(islice(it, size)):
+        yield chunk
+
+
+def process_chunk(chunk, search_one):
+    """Process a chunk of queries using the search_one function."""
+    # No progress bar in worker processes to avoid cluttering the output
+    return [search_one(query) for query in chunk]
+
+
+def process_chunk_wrapper(chunk, search_one):
+    """Wrapper to process a chunk of queries."""
+    return process_chunk(chunk, search_one)
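
For reference, a minimal, standalone sketch (not part of the change itself) of how the new cycling generator and chunked_iterable interact with the chunk_size = max(1, total_query_count // parallel) calculation; the query values and counts below are made up for illustration:

    from itertools import islice


    def cycling_query_generator(queries, total_count):
        """Yield total_count queries, repeating the input list as needed."""
        count = 0
        while count < total_count:
            for query in queries:
                if count < total_count:
                    yield query
                    count += 1
                else:
                    break


    def chunked_iterable(iterable, size):
        """Yield successive chunks of a given size from an iterable."""
        it = iter(iterable)
        while chunk := list(islice(it, size)):
            yield chunk


    queries_list = ["q1", "q2", "q3"]   # illustrative stand-ins for real queries
    num_queries = 7                     # more queries requested than available
    parallel = 2

    cycled = list(cycling_query_generator(queries_list, num_queries))
    print(cycled)                       # ['q1', 'q2', 'q3', 'q1', 'q2', 'q3', 'q1']

    chunk_size = max(1, num_queries // parallel)        # 7 // 2 == 3
    chunks = list(chunked_iterable(cycled, chunk_size))
    print([len(c) for c in chunks])     # [3, 3, 1]

Note that when the total is not evenly divisible by parallel, the chunking produces one extra, smaller chunk, so one more worker process than parallel is started.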
0 commit comments
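The worker fan-out in the else branch follows a standard Process/Queue pattern: each worker puts its chunk's results on a shared queue, the parent drains the queue once per worker, and only then joins the processes. A minimal sketch of that pattern in isolation, where double_chunk is a hypothetical stand-in for process_chunk(chunk, search_one):

    from multiprocessing import Process, Queue


    def double_chunk(chunk, result_queue):
        # Stand-in for process_chunk(chunk, search_one): compute results for one chunk
        result_queue.put([x * 2 for x in chunk])


    if __name__ == "__main__":
        chunks = [[1, 2, 3], [4, 5], [6]]
        result_queue = Queue()

        # Start one worker per chunk
        processes = [Process(target=double_chunk, args=(chunk, result_queue)) for chunk in chunks]
        for p in processes:
            p.start()

        # Drain the queue before joining: joining first can deadlock if a child
        # is still blocked on putting a large result into the queue
        results = []
        for _ in processes:
            results.extend(result_queue.get())

        for p in processes:
            p.join()

        print(sorted(results))   # [2, 4, 6, 8, 10, 12]

One caveat about the diffed code: worker_function is a local closure, which only works with the fork start method (the default on Linux); under spawn, Process targets must be picklable, so the closure-based approach would fail there.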