Skip to content

Remove fast export munging #191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Documentation/git-fast-export.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,13 @@ marks the same across runs.
for intermediary filters (e.g. for rewriting commit messages
which refer to older commits, or for stripping blobs by id).

--reencode=(yes|no|abort)::
Specify how to handle the `encoding` header in commit objects. With
'abort' (which is the default), this program will die when
encountering such a commit object. With 'yes', the commit message
will be reencoded into UTF-8. With 'no', the original encoding will
be preserved.

--refspec::
Apply the specified refspec to each ref exported. Multiple of them can
be specified.
Expand Down
7 changes: 7 additions & 0 deletions Documentation/git-fast-import.txt
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ change to the project.
original-oid?
('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
'committer' (SP <name>)? SP LT <email> GT SP <when> LF
('encoding' SP <encoding> LF)?
data
('from' SP <commit-ish> LF)?
('merge' SP <commit-ish> LF)?
Expand Down Expand Up @@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
See ``Date Formats'' above for the set of supported formats, and
their syntax.

`encoding`
^^^^^^^^^^
The optional `encoding` command indicates the encoding of the commit
message. Most commits are UTF-8 and the encoding is omitted, but this
allows importing commit messages into git without first reencoding them.

`from`
^^^^^^
The `from` command is used to specify the commit to initialize
Expand Down
55 changes: 49 additions & 6 deletions builtin/fast-export.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
static int progress;
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
static int fake_missing_tagger;
static int use_done_feature;
static int no_data;
Expand Down Expand Up @@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
return 0;
}

/*
 * Option callback for --reencode=<mode>; stores the selected mode in the
 * file-scope variable reencode_mode.
 *
 * When `unset` is non-zero the mode is reset to REENCODE_ABORT. Otherwise
 * `arg` is handed to git_parse_maybe_bool(): a result of 0 selects
 * REENCODE_NO and a result of 1 selects REENCODE_YES. For any other result
 * the argument is accepted only if it equals "abort" (compared without
 * regard to case), which selects REENCODE_ABORT.
 *
 * Returns 0 on success, or whatever error() returns for an unknown mode.
 * `opt` is not consulted.
 */
static int parse_opt_reencode_mode(const struct option *opt,
				   const char *arg, int unset)
{
	int as_bool;

	if (unset) {
		reencode_mode = REENCODE_ABORT;
		return 0;
	}

	as_bool = git_parse_maybe_bool(arg);
	if (as_bool == 0)
		reencode_mode = REENCODE_NO;
	else if (as_bool == 1)
		reencode_mode = REENCODE_YES;
	else if (!strcasecmp(arg, "abort"))
		reencode_mode = REENCODE_ABORT;
	else
		return error("Unknown reencoding mode: %s", arg);

	return 0;
}

static struct decoration idnums;
static uint32_t last_idnum;

Expand Down Expand Up @@ -453,7 +479,7 @@ static const char *find_encoding(const char *begin, const char *end)
bol = memmem(begin, end ? end - begin : strlen(begin),
needle, strlen(needle));
if (!bol)
return git_commit_encoding;
return NULL;
bol += strlen(needle);
eol = strchrnul(bol, '\n');
*eol = '\0';
Expand Down Expand Up @@ -633,18 +659,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
}

mark_next_object(&commit->object);
if (anonymize)
if (anonymize) {
reencoded = anonymize_commit_message(message);
else if (!is_encoding_utf8(encoding))
reencoded = reencode_string(message, "UTF-8", encoding);
} else if (encoding) {
switch(reencode_mode) {
case REENCODE_YES:
reencoded = reencode_string(message, "UTF-8", encoding);
break;
case REENCODE_NO:
break;
case REENCODE_ABORT:
die("Encountered commit-specific encoding %s in commit "
"%s; use --reencode=[yes|no] to handle it",
encoding, oid_to_hex(&commit->object.oid));
}
}
if (!commit->parents)
printf("reset %s\n", refname);
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
if (show_original_ids)
printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
printf("%.*s\n%.*s\ndata %u\n%s",
printf("%.*s\n%.*s\n",
(int)(author_end - author), author,
(int)(committer_end - committer), committer,
(int)(committer_end - committer), committer);
if (!reencoded && encoding)
printf("encoding %s\n", encoding);
printf("data %u\n%s",
(unsigned)(reencoded
? strlen(reencoded) : message
? strlen(message) : 0),
Expand Down Expand Up @@ -1088,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
N_("select handling of tags that tag filtered objects"),
parse_opt_tag_of_filtered_mode),
OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
N_("select handling of commit messages in an alternate encoding"),
parse_opt_reencode_mode),
OPT_STRING(0, "export-marks", &export_filename, N_("file"),
N_("Dump marks to this file")),
OPT_STRING(0, "import-marks", &import_filename, N_("file"),
Expand Down
11 changes: 9 additions & 2 deletions fast-import.c
Original file line number Diff line number Diff line change
Expand Up @@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
struct branch *b;
char *author = NULL;
char *committer = NULL;
const char *encoding = NULL;
struct hash_list *merge_list = NULL;
unsigned int merge_count;
unsigned char prev_fanout, new_fanout;
Expand All @@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
}
if (!committer)
die("Expected committer but didn't get one");
if (skip_prefix(command_buf.buf, "encoding ", &encoding))
read_next_command();
parse_data(&msg, 0, NULL);
read_next_command();
parse_from(b);
Expand Down Expand Up @@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
}
strbuf_addf(&new_data,
"author %s\n"
"committer %s\n"
"\n",
"committer %s\n",
author ? author : committer, committer);
if (encoding)
strbuf_addf(&new_data,
"encoding %s\n",
encoding);
strbuf_addch(&new_data, '\n');
strbuf_addbuf(&new_data, &msg);
free(author);
free(committer);
Expand Down
20 changes: 20 additions & 0 deletions t/t9300-fast-import.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
'

###
### series X (other new features)
###

# Feed fast-import a commit that carries an "encoding iso-8859-7" command,
# then check that the stored object keeps the original byte while the log
# output contains the UTF-8 form.
test_expect_success 'X: handling encoding' '
test_tick &&
cat >input <<-INPUT_END &&
commit refs/heads/encoding
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
encoding iso-8859-7
data <<COMMIT
INPUT_END

# Append the message body (Pi as the single byte \360) followed by the
# COMMIT line that terminates the "data <<COMMIT" section of the stream.
printf "Pi: \360\nCOMMIT\n" >>input &&

git fast-import <input &&
# The commit object itself must still contain the raw \360 byte...
git cat-file -p encoding | grep $(printf "\360") &&
# ...and the log output is expected to contain \317\200 instead.
git log -1 --format=%B encoding | grep $(printf "\317\200")
'

test_done
78 changes: 69 additions & 9 deletions t/t9350-fast-export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,22 +94,83 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
test $MUSS = $(git rev-parse --verify refs/tags/muss)
'

test_expect_success 'iso-8859-1' '
test_expect_success 'reencoding iso-8859-7' '

git config i18n.commitencoding ISO8859-1 &&
# use author and committer name in ISO-8859-1 to match it.
. "$TEST_DIRECTORY"/t3901/8859-1.txt &&
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
test_tick &&
echo rosten >file &&
git commit -s -m den file &&
git fast-export wer^..wer >iso8859-1.fi &&
sed "s/wer/i18n/" iso8859-1.fi |
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n/" iso-8859-7.fi |
(cd new &&
git fast-import &&
# The commit object, if not re-encoded, would be 240 bytes.
# Removing the "encoding iso-8859-7\n" header drops 20 bytes.
# Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
# to \xCF\x80 (\317\200) in UTF-8 adds a byte. Check for
# the expected size.
test 221 -eq "$(git cat-file -s i18n)" &&
# ...and for the expected translation of bytes.
git cat-file commit i18n >actual &&
grep "Áéí óú" actual)
grep $(printf "\317\200") actual &&
# Also make sure the commit does not have the "encoding" header
! grep ^encoding actual)
'

# With --reencode=abort, exporting a commit made while
# i18n.commitencoding is iso-8859-7 must fail.
test_expect_success 'aborting on iso-8859-7' '

# Drop the commit created below once the test is over.
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
'

# With --reencode=no, the exported and re-imported commit must keep both
# its "encoding" header and its original iso-8859-7 message bytes.
test_expect_success 'preserving iso-8859-7' '

# Drop the commit created below once the test is over.
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
(cd new &&
git fast-import &&
# The commit object, if not re-encoded, is 240 bytes.
# Removing the "encoding iso-8859-7\n" header would drop 20
# bytes. Re-encoding the Pi character from \xF0 (\360) in
# iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 would add a byte.
# Since neither should happen here, check for the unchanged size...
test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
# ...as well as the expected byte.
git cat-file commit i18n-no-recoding >actual &&
grep $(printf "\360") actual &&
# Also make sure the commit has the "encoding" header
grep ^encoding actual)
'

# With --reencode=yes and a message from the "broken" fixture file, the
# export must still succeed and the imported commit must keep its
# "encoding" header and its original bytes.
test_expect_success 'encoding preserved if reencoding fails' '

# Drop the commit created below once the test is over.
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n-invalid/" iso-8859-7.fi |
(cd new &&
git fast-import &&
git cat-file commit i18n-invalid >actual &&
# Make sure the commit still has the encoding header
grep ^encoding actual &&
# Verify that the commit has the expected size; i.e.
# that no bytes were re-encoded to a different encoding.
test 252 -eq "$(git cat-file -s i18n-invalid)" &&
# ...and check for the original special bytes
grep $(printf "\360") actual &&
grep $(printf "\377") actual)
'

test_expect_success 'import/export-marks' '

git checkout -b marks master &&
Expand Down Expand Up @@ -224,7 +285,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME

test_expect_success 'setup copies' '

git config --unset i18n.commitencoding &&
git checkout -b copy rein &&
git mv file file3 &&
git commit -m move1 &&
Expand Down
1 change: 1 addition & 0 deletions t/t9350/broken-iso-8859-7-commit-message.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Pi: �; Invalid: �
1 change: 1 addition & 0 deletions t/t9350/simple-iso-8859-7-commit-message.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Pi: �