-
Notifications
You must be signed in to change notification settings - Fork 86
/
pull.py
319 lines (280 loc) · 11.8 KB
/
pull.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import os
import subprocess
import logging
import time
import argparse
import datetime
from traitlets import Integer, default
from traitlets.config import Configurable
from functools import partial
def execute_cmd(cmd, **kwargs):
    """
    Call given command, yielding output line by line

    The first item yielded is an echo of the command itself ('$ <cmd>\\n').
    After that, each item is one chunk of the combined stdout/stderr of the
    process, decoded as UTF-8 (undecodable bytes are replaced). Chunks are
    split on `\\n`, `\\r\\n` or a bare `\\r`, and keep their terminator. Any
    trailing output that is not terminated by a line ending is yielded as a
    final chunk.

    cmd is the argument list passed to subprocess.Popen. Extra keyword
    arguments are forwarded to Popen as well, except that 'stdout' and
    'stderr' are always overridden so output can be captured.

    Raises subprocess.CalledProcessError if the process exits with a
    non-zero return code.
    """
    yield '$ {}\n'.format(' '.join(cmd))
    kwargs['stdout'] = subprocess.PIPE
    kwargs['stderr'] = subprocess.STDOUT

    proc = subprocess.Popen(cmd, **kwargs)

    # Capture output for logging.
    # Each line will be yielded as text.
    # This should behave the same as .readline(), but splits on `\r` OR `\n`,
    # not just `\n`.
    buf = []

    def flush():
        # Decode whatever has been buffered so far and reset the buffer
        line = b''.join(buf).decode('utf8', 'replace')
        buf[:] = []
        return line

    # Kept as bytes so it has the same type as the values it is compared to
    c_last = b''
    try:
        for c in iter(partial(proc.stdout.read, 1), b''):
            # A `\r` not followed by `\n` ends a line on its own
            # (this is how progress output from git is usually written)
            if c_last == b'\r' and buf and c != b'\n':
                yield flush()
            buf.append(c)
            if c == b'\n':
                yield flush()
            c_last = c
        # The process may end its output without a final line terminator.
        # Don't lose whatever is still buffered in that case.
        if buf:
            yield flush()
    finally:
        try:
            ret = proc.wait()
        finally:
            # Release the pipe's file descriptor instead of waiting for GC
            proc.stdout.close()
        if ret != 0:
            raise subprocess.CalledProcessError(ret, cmd)
class GitPuller(Configurable):
    """
    Clone a remote git repository into a local directory, or update an
    existing clone while preserving local user changes.

    All long running operations are generators: they yield the output of the
    git commands they run (plus a few status messages) so that callers can
    stream progress to the user.
    """

    depth = Integer(
        config=True,
        help="""
        Depth (ie, commit count) of clone operations. Set this to 0 to make a
        full depth clone.
        Defaults to the value of the environment variable NBGITPULLER_DEPTH, or
        1 if the the environment variable isn't set.
        """
    )

    @default('depth')
    def _depth_default(self):
        """This is a workaround for setting the same default directly in the
        definition of the traitlet above. Without it, the test fails because a
        change in the environment variable has no impact. I think this is a
        consequence of the tests not starting with a totally clean environment
        where the GitPuller class hadn't been loaded already."""
        return int(os.environ.get('NBGITPULLER_DEPTH', 1))

    def __init__(self, git_url, repo_dir, **kwargs):
        """
        git_url is the URL of the remote repository, repo_dir the local path
        it is cloned to / updated in.

        The optional keyword argument `branch` selects the branch (or tag) to
        sync. When it is missing or None, the default branch of the remote is
        looked up. All other keyword arguments that are not None are passed on
        to the Configurable constructor.

        Raises ValueError if the requested branch does not exist in the
        remote, or if the remote can not be queried.
        """
        assert git_url

        self.git_url = git_url
        # `branch` is optional; a missing key is treated the same as None
        # instead of failing with a KeyError
        self.branch_name = kwargs.pop("branch", None)

        if self.branch_name is None:
            self.branch_name = self.resolve_default_branch()
        elif not self.branch_exists(self.branch_name):
            raise ValueError(f"Branch: {self.branch_name} -- not found in repo: {self.git_url}")

        self.repo_dir = repo_dir
        newargs = {k: v for k, v in kwargs.items() if v is not None}
        super(GitPuller, self).__init__(**newargs)

    def branch_exists(self, branch):
        """
        This checks to make sure the branch we are told to access
        exists in the repo

        Both branches (refs/heads/*) and tags (refs/tags/*) of the remote are
        considered. Returns True if `branch` names one of them.

        Raises ValueError if the remote can not be queried.
        """
        try:
            # --heads and --tags are not mutually exclusive, so a single
            # round trip to the remote lists both kinds of refs
            remote_refs = subprocess.run(
                ["git", "ls-remote", "--heads", "--tags", "--", self.git_url],
                capture_output=True,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as err:
            m = f"Problem accessing list of branches and/or tags: {self.git_url}"
            logging.exception(m)
            raise ValueError(m) from err

        names = set()
        for line in remote_refs.stdout.splitlines():
            # Each line resembles --> <sha>\trefs/heads/<name>
            fields = line.split()
            if len(fields) != 2:
                # Not a ref listing line, nothing to learn from it
                continue
            ref_parts = fields[1].split("/", 2)
            if len(ref_parts) == 3:
                # Drop the leading 'refs/heads/' or 'refs/tags/'
                names.add(ref_parts[2])
        return branch in names

    def resolve_default_branch(self):
        """
        This will resolve the default branch of the repo in
        the case where the branch given does not exist

        Returns the name of the branch the remote HEAD points to.

        Raises ValueError if the remote can not be queried, or if it does not
        report a symbolic ref for HEAD.
        """
        try:
            head_branch = subprocess.run(
                ["git", "ls-remote", "--symref", "--", self.git_url, "HEAD"],
                capture_output=True,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as err:
            m = f"Problem accessing HEAD branch: {self.git_url}"
            logging.exception(m)
            raise ValueError(m) from err

        for line in head_branch.stdout.splitlines():
            if line.startswith("ref:"):
                # line resembles --> ref: refs/heads/main HEAD
                _, ref, _head = line.split()
                _refs, _heads, branch_name = ref.split("/", 2)
                return branch_name
        raise ValueError(f"default branch not found in {self.git_url}")

    def pull(self):
        """
        Pull selected repo from a remote git repository,
        while preserving user changes

        Clones the repository if repo_dir does not exist yet, updates the
        existing clone otherwise. Yields the output of the operations run.
        """
        if not os.path.exists(self.repo_dir):
            yield from self.initialize_repo()
        else:
            yield from self.update()

    def initialize_repo(self):
        """
        Clones repository

        The clone is limited to `depth` commits when depth is greater than 0,
        and checks out the selected branch.
        """
        logging.info('Repo {} doesn\'t exist. Cloning...'.format(self.repo_dir))
        clone_args = ['git', 'clone']
        if self.depth and self.depth > 0:
            clone_args.extend(['--depth', str(self.depth)])
        clone_args.extend(['--branch', self.branch_name])
        clone_args.extend(["--", self.git_url, self.repo_dir])
        yield from execute_cmd(clone_args)
        logging.info('Repo {} initialized'.format(self.repo_dir))

    def reset_deleted_files(self):
        """
        Runs the equivalent of git checkout -- <file> for each file that was
        deleted. This allows us to delete a file, hit an interact link, then get a
        clean version of the file again.
        """
        yield from self.ensure_lock()
        # With -z, entries are terminated by NUL and file names are output
        # verbatim. The output must not be stripped, since leading or trailing
        # whitespace can legitimately be part of a file name.
        deleted_files = subprocess.check_output([
            'git', 'ls-files', '--deleted', '-z'
        ], cwd=self.repo_dir).decode().split('\0')

        for filename in deleted_files:
            if filename:  # Filter out empty entries (eg, after the last NUL)
                yield from execute_cmd(['git', 'checkout', 'origin/{}'.format(self.branch_name), '--', filename], cwd=self.repo_dir)

    def repo_is_dirty(self):
        """
        Return true if repo is dirty

        This relies on the exit code of `git diff-files --quiet` run in
        repo_dir: a non-zero exit code is reported as dirty.
        """
        try:
            subprocess.check_call(['git', 'diff-files', '--quiet'], cwd=self.repo_dir)
            # Return code is 0
            return False
        except subprocess.CalledProcessError:
            return True

    def update_remotes(self):
        """
        Do a git fetch so our remotes are up to date
        """
        yield from execute_cmd(['git', 'fetch'], cwd=self.repo_dir)

    def find_upstream_changed(self, kind):
        """
        Return list of files that have been changed upstream belonging to a particular kind of change

        `kind` is matched against the start of the lines printed by
        `git log --name-status` (eg, 'A' for added files). The returned paths
        are prefixed with repo_dir.
        """
        output = subprocess.check_output([
            'git', 'log', '..origin/{}'.format(self.branch_name),
            '--oneline', '--name-status'
        ], cwd=self.repo_dir).decode()
        files = []
        for line in output.split('\n'):
            # Status lines are '<status>\t<path>'. Lines without a tab carry
            # no path, so they are skipped rather than failing on the split.
            if line.startswith(kind) and '\t' in line:
                files.append(os.path.join(self.repo_dir, line.split('\t', 1)[1]))

        return files

    def ensure_lock(self):
        """
        Make sure we have the .git/lock required to do modifications on the repo

        This must be called before any git commands that modify state. This isn't guaranteed
        to be atomic, due to the nature of using files for locking. But it's the best we
        can do right now.

        Raises Exception if a lock file younger than 10 minutes exists.
        """
        try:
            lockpath = os.path.join(self.repo_dir, '.git', 'index.lock')
            mtime = os.path.getmtime(lockpath)
            # A lock file does exist
            # If it's older than 10 minutes, we just assume it is stale and take over
            # If not, we fail with an explicit error.
            if time.time() - mtime > 600:
                yield "Stale .git/index.lock found, attempting to remove"
                os.remove(lockpath)
                yield "Stale .git/index.lock removed"
            else:
                raise Exception('Recent .git/index.lock found, operation can not proceed. Try again in a few minutes.')
        except FileNotFoundError:
            # No lock is held by other processes, we are free to go
            return

    def rename_local_untracked(self):
        """
        Rename local untracked files that would require pulls

        Every file that was added upstream and already exists locally is
        renamed by adding a timestamp to its name. Yields one message per
        renamed file.
        """
        # Find what files have been added!
        new_upstream_files = self.find_upstream_changed('A')
        for f in new_upstream_files:
            if os.path.exists(f):
                # If there's a file extension, put the timestamp before that
                ts = datetime.datetime.now().strftime('__%Y%m%d%H%M%S')
                path_head, path_tail = os.path.split(f)
                path_tail = ts.join(os.path.splitext(path_tail))
                new_file_name = os.path.join(path_head, path_tail)
                os.rename(f, new_file_name)
                yield 'Renamed {} to {} to avoid conflict with upstream'.format(f, new_file_name)

    def update(self):
        """
        Do the pulling if necessary

        Fetches the remote, moves conflicting untracked files out of the way,
        restores deleted files, commits local changes if there are any, and
        finally merges the upstream branch, preferring local changes on
        conflict.
        """
        # Fetch remotes, so we know we're dealing with latest remote
        yield from self.update_remotes()

        # Rename local untracked files that might be overwritten by pull
        yield from self.rename_local_untracked()

        # Reset local files that have been deleted. We don't actually expect users to
        # delete something that's present upstream and expect to keep it. This prevents
        # unnecessary conflicts, and also allows users to click the link again to get
        # a fresh copy of a file they might have screwed up.
        yield from self.reset_deleted_files()

        # If there are local changes, make a commit so we can do merges when pulling
        # We also allow empty commits. On NFS (at least), sometimes repo_is_dirty returns a false
        # positive, returning True even when there are no local changes (git diff-files seems to return
        # bogus output?). While ideally that would not happen, allowing empty commits keeps us
        # resilient to that issue.
        # We explicitly set user info of the commits we are making, to keep that separate from
        # whatever author info is set in system / repo config by the user. We pass '-c' to git
        # itself (rather than to 'git commit') to temporarily set config variables. This is
        # better than passing --author, since git treats author separately from committer.
        if self.repo_is_dirty():
            yield from self.ensure_lock()
            yield from execute_cmd([
                'git',
                '-c', 'user.email=nbgitpuller@nbgitpuller.link',
                '-c', 'user.name=nbgitpuller',
                'commit',
                '-am', 'Automatic commit by nbgitpuller',
                '--allow-empty'
            ], cwd=self.repo_dir)

        # Merge master into local!
        yield from self.ensure_lock()
        yield from execute_cmd([
            'git',
            '-c', 'user.email=nbgitpuller@nbgitpuller.link',
            '-c', 'user.name=nbgitpuller',
            'merge',
            '-Xours', 'origin/{}'.format(self.branch_name)
        ], cwd=self.repo_dir)
def main():
    """
    Command line entry point: synchronizes a github repository with a local repository.

    Takes the repository URL, an optional branch name and an optional target
    directory (defaults to '.') as positional arguments, then prints every
    item produced while pulling.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(asctime)s] %(levelname)s -- %(message)s',
    )

    cli = argparse.ArgumentParser(description='Synchronizes a github repository with a local repository.')
    cli.add_argument('git_url', help='Url of the repo to sync')
    cli.add_argument('branch_name', nargs='?', default=None, help='Branch of repo to sync')
    cli.add_argument('repo_dir', nargs='?', default='.', help='Path to clone repo under')
    options = cli.parse_args()

    # An empty branch argument is handled the same way as an omitted one
    selected_branch = options.branch_name or None
    puller = GitPuller(options.git_url, options.repo_dir, branch=selected_branch)
    for output_line in puller.pull():
        print(output_line)


# Only run the sync when executed as a script, not when imported
if __name__ == '__main__':
    main()