Skip to content

Remove fast export munging 2 #187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Documentation/git-fast-import.txt
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ change to the project.
original-oid?
('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
'committer' (SP <name>)? SP LT <email> GT SP <when> LF
('encoding' SP <encoding> LF)?
data
('from' SP <commit-ish> LF)?
('merge' SP <commit-ish> LF)?
Expand Down Expand Up @@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
See ``Date Formats'' above for the set of supported formats, and
their syntax.

`encoding`
^^^^^^^^^^
The optional `encoding` command indicates the encoding of the commit
message. Most commits are UTF-8 and the encoding is omitted, but this
allows importing commit messages into git without first reencoding them.

`from`
^^^^^^
The `from` command is used to specify the commit to initialize
Expand Down
44 changes: 38 additions & 6 deletions builtin/fast-export.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
static int progress;
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER } reencode_mode = REENCODE_ABORT;
static int fake_missing_tagger;
static int use_done_feature;
static int no_data;
Expand Down Expand Up @@ -77,6 +78,20 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
return 0;
}

/*
 * Option callback for --reencode=<mode>.
 *
 * Stores the requested handling in the file-level reencode_mode:
 *   "abort" -> REENCODE_ABORT
 *   "yes"   -> REENCODE_PLEASE
 *   "no"    -> REENCODE_NEVER
 * When the option is negated (unset), REENCODE_ABORT is selected and
 * arg is not examined.
 *
 * Returns 0 when a mode was stored; for any other value of arg the
 * result of error() is returned and reencode_mode is left untouched.
 */
static int parse_opt_reencode_mode(const struct option *opt,
				   const char *arg, int unset)
{
	/* Check unset first: arg must not be inspected in that case. */
	if (unset) {
		reencode_mode = REENCODE_ABORT;
		return 0;
	}

	if (!strcmp(arg, "abort")) {
		reencode_mode = REENCODE_ABORT;
		return 0;
	}

	if (!strcmp(arg, "yes")) {
		reencode_mode = REENCODE_PLEASE;
		return 0;
	}

	if (!strcmp(arg, "no")) {
		reencode_mode = REENCODE_NEVER;
		return 0;
	}

	return error("Unknown reencoding mode: %s", arg);
}

static struct decoration idnums;
static uint32_t last_idnum;

Expand Down Expand Up @@ -453,7 +468,7 @@ static const char *find_encoding(const char *begin, const char *end)
bol = memmem(begin, end ? end - begin : strlen(begin),
needle, strlen(needle));
if (!bol)
return git_commit_encoding;
return NULL;
bol += strlen(needle);
eol = strchrnul(bol, '\n');
*eol = '\0';
Expand Down Expand Up @@ -633,18 +648,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
}

mark_next_object(&commit->object);
if (anonymize)
if (anonymize) {
reencoded = anonymize_commit_message(message);
else if (!is_encoding_utf8(encoding))
reencoded = reencode_string(message, "UTF-8", encoding);
} else if (encoding) {
switch(reencode_mode) {
case REENCODE_PLEASE:
reencoded = reencode_string(message, "UTF-8", encoding);
break;
case REENCODE_NEVER:
break;
case REENCODE_ABORT:
die("Encountered commit-specific encoding %s in commit "
"%s; use --reencode=<mode> to handle it",
encoding, oid_to_hex(&commit->object.oid));
}
}
if (!commit->parents)
printf("reset %s\n", refname);
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
if (show_original_ids)
printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
printf("%.*s\n%.*s\ndata %u\n%s",
printf("%.*s\n%.*s\n",
(int)(author_end - author), author,
(int)(committer_end - committer), committer,
(int)(committer_end - committer), committer);
if (!reencoded && encoding)
printf("encoding %s\n", encoding);
printf("data %u\n%s",
(unsigned)(reencoded
? strlen(reencoded) : message
? strlen(message) : 0),
Expand Down Expand Up @@ -1088,6 +1117,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
N_("select handling of tags that tag filtered objects"),
parse_opt_tag_of_filtered_mode),
OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
N_("select handling of commit messages in an alternate encoding"),
parse_opt_reencode_mode),
OPT_STRING(0, "export-marks", &export_filename, N_("file"),
N_("Dump marks to this file")),
OPT_STRING(0, "import-marks", &import_filename, N_("file"),
Expand Down
11 changes: 9 additions & 2 deletions fast-import.c
Original file line number Diff line number Diff line change
Expand Up @@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
struct branch *b;
char *author = NULL;
char *committer = NULL;
const char *encoding = NULL;
struct hash_list *merge_list = NULL;
unsigned int merge_count;
unsigned char prev_fanout, new_fanout;
Expand All @@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
}
if (!committer)
die("Expected committer but didn't get one");
if (skip_prefix(command_buf.buf, "encoding ", &encoding))
read_next_command();
parse_data(&msg, 0, NULL);
read_next_command();
parse_from(b);
Expand Down Expand Up @@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
}
strbuf_addf(&new_data,
"author %s\n"
"committer %s\n"
"\n",
"committer %s\n",
author ? author : committer, committer);
if (encoding)
strbuf_addf(&new_data,
"encoding %s\n",
encoding);
strbuf_addch(&new_data, '\n');
strbuf_addbuf(&new_data, &msg);
free(author);
free(committer);
Expand Down
20 changes: 20 additions & 0 deletions t/t9300-fast-import.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
'

###
### series X (other new features)
###

test_expect_success 'X: handling encoding' '
test_tick &&
cat >input <<-INPUT_END &&
commit refs/heads/encoding
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
encoding iso-8859-7
data <<COMMIT
INPUT_END

printf "Pi: \360\nCOMMIT\n" >>input &&

git fast-import <input &&
git cat-file -p encoding | grep $(printf "\360") &&
git log -1 --format=%B encoding | grep $(printf "\317\200")
'

test_done
75 changes: 66 additions & 9 deletions t/t9350-fast-export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,22 +94,80 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
test $MUSS = $(git rev-parse --verify refs/tags/muss)
'

test_expect_success 'iso-8859-1' '
test_expect_success 'reencoding iso-8859-7' '

git config i18n.commitencoding ISO8859-1 &&
# use author and committer name in ISO-8859-1 to match it.
. "$TEST_DIRECTORY"/t3901/8859-1.txt &&
test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
test_tick &&
echo rosten >file &&
git commit -s -m den file &&
git fast-export wer^..wer >iso8859-1.fi &&
sed "s/wer/i18n/" iso8859-1.fi |
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n/" iso-8859-7.fi |
(cd new &&
git fast-import &&
# The commit object, if not re-encoded, would be 240 bytes.
# Removing the "encoding iso-8859-7\n" header drops 20 bytes.
# Re-encoding the Pi character from \xF0 in iso-8859-7 to
# \xCF\x80 in utf-8 adds a byte. Grepping for specific bytes
# would be nice, but Windows apparently munges user data
# in the form of bytes on the command line to force them to
# be characters instead, so we are limited for portability
# reasons in subsequent similar tests in this file to check
# for size rather than what bytes are present.
test 221 -eq "$(git cat-file -s i18n)" &&
# Also make sure the commit does not have the "encoding" header
git cat-file commit i18n >actual &&
grep "Áéí óú" actual)
! grep ^encoding actual)
'

test_expect_success 'aborting on iso-8859-7' '

test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
'

test_expect_success 'preserving iso-8859-7' '

test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
(cd new &&
git fast-import &&
# The commit object, if not re-encoded, is 240 bytes.
# Removing the "encoding iso-8859-7\n" header would drop 20
# bytes. Re-encoding the Pi character from \xF0 in
# iso-8859-7 to \xCF\x80 in utf-8 would add a byte. I would
# grep for the specific bytes, but Windows lamely does not
# allow that, so just search for the expected size.
test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
# Also make sure the commit has the "encoding" header
git cat-file commit i18n-no-recoding >actual &&
grep ^encoding actual)
'

test_expect_success 'encoding preserved if reencoding fails' '

test_when_finished "git reset --hard HEAD~1" &&
test_config i18n.commitencoding iso-8859-7 &&
echo rosten >file &&
git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
sed "s/wer/i18n-invalid/" iso-8859-7.fi |
(cd new &&
git fast-import &&
git cat-file commit i18n-invalid >actual &&
grep ^encoding actual &&
# Also verify that the commit has the expected size; i.e.
# that no bytes were re-encoded to a different encoding.
test 252 -eq "$(git cat-file -s i18n-invalid)")
'

test_expect_success 'import/export-marks' '

git checkout -b marks master &&
Expand Down Expand Up @@ -224,7 +282,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME

test_expect_success 'setup copies' '

git config --unset i18n.commitencoding &&
git checkout -b copy rein &&
git mv file file3 &&
git commit -m move1 &&
Expand Down
1 change: 1 addition & 0 deletions t/t9350/broken-iso-8859-7-commit-message.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Pi: �; Invalid: �
1 change: 1 addition & 0 deletions t/t9350/simple-iso-8859-7-commit-message.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Pi: �