-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
126 lines (102 loc) · 3.49 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import itertools
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import typer
import pandas as pd
from git import Repo
from rich.progress import Progress
def batch(iterable, size):
    """Yield successive lists of at most *size* items from *iterable*.

    The final batch may be shorter than *size*. Terminates when the
    source is exhausted — the original looped forever, yielding empty
    lists once the iterator was drained, because ``islice`` of an
    exhausted iterator simply produces nothing.

    Args:
        iterable: Any iterable to split into chunks.
        size: Maximum number of items per chunk.

    Yields:
        list: The next chunk of up to *size* items.
    """
    sourceiter = iter(iterable)
    while True:
        batchiter = list(itertools.islice(sourceiter, size))
        if not batchiter:
            # Source exhausted — stop instead of yielding [] forever.
            return
        yield batchiter
def read_file(path: str):
    """Load a tabular file into a pandas DataFrame, dispatching on extension.

    Supported extensions: ``.csv``, ``.jsonl``, ``.xlsx``, ``.parquet``.

    Args:
        path: Path to the file to read.

    Returns:
        pandas.DataFrame: The parsed contents.

    Raises:
        typer.Exit: With code 1 when the extension is unsupported.
    """
    # Map each supported extension to its pandas reader.
    readers = {
        ".csv": pd.read_csv,
        ".jsonl": lambda p: pd.read_json(p, lines=True),
        ".xlsx": pd.read_excel,
        ".parquet": pd.read_parquet,
    }
    ext = os.path.splitext(path)[1]
    reader = readers.get(ext)
    if reader is None:
        print(f"[!] File extension {ext} not supported")
        raise typer.Exit(1)
    return reader(path)
def get_repo_num_commits(repo: Repo, before=None, after=None):
    """Count commits reachable from HEAD via ``git rev-list --count``.

    Args:
        repo: GitPython repository handle.
        before: Optional date/revision bound passed as ``--before=``.
        after: Optional date/revision bound passed as ``--after=``.

    Returns:
        int: Number of matching commits.
    """
    # Insertion order keeps --before ahead of --after, as before.
    bounds = {"--before": before, "--after": after}
    opts = ["--count"]
    opts.extend(f"{flag}={value}" for flag, value in bounds.items() if value)
    opts.append("HEAD")
    return int(repo.git.rev_list(*opts))
def read_lines(path: str, strip=False):
    """Read all lines from the file at *path*.

    Args:
        path: Path of the text file to read.
        strip: When True, strip leading/trailing whitespace (including the
            trailing newline) from every line.

    Returns:
        list[str]: The file's lines, optionally stripped.
    """
    # Close the file before post-processing; the original also pointlessly
    # pre-initialized ``lines = []`` only to overwrite it immediately.
    with open(path, "r") as f:
        lines = f.readlines()
    if strip:
        return [line.strip() for line in lines]
    return lines
def clone_repos(
    repo_urls: list[str],
    directory: str,
    num_workers: int,
    summary_progress: Progress = None,
    task_progress: Progress = None,
):
    """Clone every URL in *repo_urls* into *directory* using a thread pool.

    Each repo is cloned to ``{directory}/{netloc}{path}`` derived from its
    parsed URL.

    Args:
        repo_urls: Git clone URLs.
        directory: Root directory that receives the clones.
        num_workers: Maximum number of concurrent clone threads.
        summary_progress: Optional rich Progress for the overall bar.
            Progress updates are skipped when None — previously the None
            default was dereferenced unconditionally and crashed with
            AttributeError.
        task_progress: Optional rich Progress for per-repo spinners;
            likewise skipped when None.

    Returns:
        list[Repo]: The successfully cloned repositories.

    Raises:
        Exception: Whatever ``future.result()`` re-raises if a clone fails
            (e.g. git.GitCommandError) — unchanged from the original.
    """
    parsed_urls = [urlparse(url) for url in repo_urls]
    clone_task = None
    if summary_progress is not None:
        clone_task = summary_progress.add_task(
            "[red]Downloading Repos...",
            total=len(repo_urls),
            visible=len(repo_urls) > 0,
        )
    repos = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {}
        for url, parsed in zip(repo_urls, parsed_urls):
            future = executor.submit(
                Repo.clone_from,
                url,
                f"{directory}/{parsed.netloc}{parsed.path}",
            )
            task = None
            if task_progress is not None:
                task = task_progress.add_task(f"{url}", total=1)
            futures[future] = task
        for future in as_completed(futures):
            spinner_task = futures[future]
            if summary_progress is not None:
                summary_progress.update(clone_task, advance=1)
            if task_progress is not None and spinner_task is not None:
                task_progress.update(spinner_task, completed=1)
                task_progress.remove_task(spinner_task)
            repo = future.result()
            if isinstance(repo, Repo):
                repos.append(repo)
    return repos
def pull_repos(
    repos: list[Repo],
    num_workers: int,
    summary_progress: Progress = None,
    task_progress: Progress = None,
):
    """Pull ``origin`` for every repository concurrently.

    Args:
        repos: Already-cloned repositories to update.
        num_workers: Maximum number of concurrent pull threads.
        summary_progress: Optional rich Progress for the overall bar.
            Skipped when None — previously the None default was
            dereferenced unconditionally and crashed with AttributeError.
        task_progress: Optional rich Progress for per-repo spinners;
            likewise skipped when None.

    Returns:
        list[Repo]: The same *repos* list that was passed in.

    Raises:
        Exception: Whatever ``future.result()`` re-raises if a pull fails.
    """
    pull_task = None
    if summary_progress is not None:
        pull_task = summary_progress.add_task(
            "[red]Pulling Repos...",
            total=len(repos),
            visible=len(repos) > 0,
        )
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {}
        for repo in repos:
            future = executor.submit(repo.remotes.origin.pull)
            repo_name = os.path.basename(repo.working_dir)
            task = None
            if task_progress is not None:
                task = task_progress.add_task(f"{repo_name}", total=1)
            futures[future] = task
        for future in as_completed(futures):
            spinner_task = futures[future]
            if summary_progress is not None:
                summary_progress.update(pull_task, advance=1)
            if task_progress is not None and spinner_task is not None:
                task_progress.update(spinner_task, completed=1)
                task_progress.remove_task(spinner_task)
            # origin.pull() returns FetchInfo objects, never a Repo, so the
            # original's ``isinstance(result, Repo)`` append was dead code —
            # and would have corrupted the *input* list had it ever fired.
            # Just surface any pull error here.
            future.result()
    return repos