Mandarin AISHELL1 #85

Merged
13 commits merged on Apr 14, 2023
3 changes: 2 additions & 1 deletion .flake8
@@ -19,7 +19,8 @@ per-file-ignores =
    valle/data/fbank.py: E501,
    valle/data/input_strategies.py: E501,
    valle/data/datamodule.py: E501,
    valle/tests/*.py: F841
    valle/tests/*.py: F841, F401
    valle/tests/data/*.py: F841, F401

exclude =
    .git,
Expand Down
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -7,15 +7,17 @@ repos:
        additional_dependencies: ['click==8.0.1']
        exclude: valle\/__init__\.py
        exclude: valle\/utils\/symbol\_table\.py
        exclude: valle\/tests\/data\/tokenizer\_test\.py

  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        args: [--max-line-length=80]
        exclude: valle\/tests\/data\/tokenizer\_test\.py

  - repo: https://github.com/pycqa/isort
    rev: 5.9.2
    rev: 5.12.0
    hooks:
      - id: isort
        args: [--profile=black, --line-length=80]
161 changes: 161 additions & 0 deletions egs/aishell1/README.md
@@ -0,0 +1,161 @@
# aishell1
## Install deps
```
pip install pypinyin
```
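
`pypinyin` is used to turn Mandarin text into pinyin initials/finals (the `pypinyin_initials_finals` text extractor set in `prepare.sh` below). A rough illustration of what that produces, using `pypinyin` directly rather than the repo's `TextTokenizer`:
```
# Illustration only: split Mandarin text into initials and tone-numbered finals.
from pypinyin import Style, lazy_pinyin

text = "大家好非常高兴能认识大家"

initials = lazy_pinyin(text, style=Style.INITIALS, strict=False)
finals = lazy_pinyin(text, style=Style.FINALS_TONE3, strict=False)

# Interleave non-empty initials with their finals,
# e.g. ['d', 'a4', 'j', 'ia1', 'h', 'ao3', ...]
phonemes = [p for pair in zip(initials, finals) for p in pair if p]
print(phonemes)
```
The actual backend is selected with the `--text-extractor` flag of `bin/tokenizer.py` (see `prepare.sh`).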

## Prepare Dataset
```
cd egs/aishell1

# Those stages are very time-consuming
bash prepare.sh --stage -1 --stop-stage 3

```

### train
```
Cut statistics:
╒═══════════════════════════╤═══════════╕
│ Cuts count: │ 120098 │
├───────────────────────────┼───────────┤
│ Total duration (hh:mm:ss) │ 150:51:08 │
├───────────────────────────┼───────────┤
│ mean │ 4.5 │
├───────────────────────────┼───────────┤
│ std │ 1.4 │
├───────────────────────────┼───────────┤
│ min │ 1.2 │
├───────────────────────────┼───────────┤
│ 25% │ 3.5 │
├───────────────────────────┼───────────┤
│ 50% │ 4.3 │
├───────────────────────────┼───────────┤
│ 75% │ 5.3 │
├───────────────────────────┼───────────┤
│ 99% │ 8.5 │
├───────────────────────────┼───────────┤
│ 99.5% │ 9.1 │
├───────────────────────────┼───────────┤
│ 99.9% │ 10.5 │
├───────────────────────────┼───────────┤
│ max │ 14.5 │
├───────────────────────────┼───────────┤
│ Recordings available: │ 120098 │
├───────────────────────────┼───────────┤
│ Features available: │ 120098 │
├───────────────────────────┼───────────┤
│ Supervisions available: │ 120098 │
╘═══════════════════════════╧═══════════╛
SUPERVISION custom fields:
Speech duration statistics:
╒══════════════════════════════╤═══════════╤══════════════════════╕
│ Total speech duration │ 150:51:08 │ 100.00% of recording │
├──────────────────────────────┼───────────┼──────────────────────┤
│ Total speaking time duration │ 150:51:08 │ 100.00% of recording │
├──────────────────────────────┼───────────┼──────────────────────┤
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
╘══════════════════════════════╧═══════════╧══════════════════════╛


```

### dev
```
Cut statistics:
╒═══════════════════════════╤══════════╕
│ Cuts count: │ 400 │
├───────────────────────────┼──────────┤
│ Total duration (hh:mm:ss) │ 00:28:37 │
├───────────────────────────┼──────────┤
│ mean │ 4.3 │
├───────────────────────────┼──────────┤
│ std │ 1.1 │
├───────────────────────────┼──────────┤
│ min │ 2.3 │
├───────────────────────────┼──────────┤
│ 25% │ 3.5 │
├───────────────────────────┼──────────┤
│ 50% │ 4.0 │
├───────────────────────────┼──────────┤
│ 75% │ 5.0 │
├───────────────────────────┼──────────┤
│ 99% │ 7.4 │
├───────────────────────────┼──────────┤
│ 99.5% │ 7.5 │
├───────────────────────────┼──────────┤
│ 99.9% │ 8.0 │
├───────────────────────────┼──────────┤
│ max │ 8.0 │
├───────────────────────────┼──────────┤
│ Recordings available: │ 400 │
├───────────────────────────┼──────────┤
│ Features available: │ 400 │
├───────────────────────────┼──────────┤
│ Supervisions available: │ 400 │
╘═══════════════════════════╧══════════╛
SUPERVISION custom fields:
Speech duration statistics:
╒══════════════════════════════╤══════════╤══════════════════════╕
│ Total speech duration │ 00:28:37 │ 100.00% of recording │
├──────────────────────────────┼──────────┼──────────────────────┤
│ Total speaking time duration │ 00:28:37 │ 100.00% of recording │
├──────────────────────────────┼──────────┼──────────────────────┤
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
╘══════════════════════════════╧══════════╧══════════════════════╛


```

### test
```
Cut statistics:
╒═══════════════════════════╤══════════╕
│ Cuts count: │ 7176 │
├───────────────────────────┼──────────┤
│ Total duration (hh:mm:ss) │ 10:01:49 │
├───────────────────────────┼──────────┤
│ mean │ 5.0 │
├───────────────────────────┼──────────┤
│ std │ 1.6 │
├───────────────────────────┼──────────┤
│ min │ 1.9 │
├───────────────────────────┼──────────┤
│ 25% │ 3.8 │
├───────────────────────────┼──────────┤
│ 50% │ 4.7 │
├───────────────────────────┼──────────┤
│ 75% │ 5.9 │
├───────────────────────────┼──────────┤
│ 99% │ 9.9 │
├───────────────────────────┼──────────┤
│ 99.5% │ 10.7 │
├───────────────────────────┼──────────┤
│ 99.9% │ 11.9 │
├───────────────────────────┼──────────┤
│ max │ 14.7 │
├───────────────────────────┼──────────┤
│ Recordings available: │ 7176 │
├───────────────────────────┼──────────┤
│ Features available: │ 7176 │
├───────────────────────────┼──────────┤
│ Supervisions available: │ 7176 │
╘═══════════════════════════╧══════════╛
SUPERVISION custom fields:
Speech duration statistics:
╒══════════════════════════════╤══════════╤══════════════════════╕
│ Total speech duration │ 10:01:49 │ 100.00% of recording │
├──────────────────────────────┼──────────┼──────────────────────┤
│ Total speaking time duration │ 10:01:49 │ 100.00% of recording │
├──────────────────────────────┼──────────┼──────────────────────┤
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
╘══════════════════════════════╧══════════╧══════════════════════╛
```
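
The tables above are lhotse cut statistics (what `CutSet.describe()` prints). Once `prepare.sh` has finished, you can reproduce them and peek at the phoneme tokens written by `bin/tokenizer.py`; a short sketch, assuming the default `data/tokenized` output directory:
```
# Sketch: inspect the cuts produced by prepare.sh (paths assume its defaults).
from lhotse import CutSet

cuts = CutSet.from_file("data/tokenized/cuts_train.jsonl.gz")
cuts.describe()  # prints the "Cut statistics" / "Speech duration statistics" tables

cut = next(iter(cuts))
sup = cut.supervisions[0]
print(sup.text)                           # original AISHELL transcript
print(sup.custom["tokens"]["text"][:10])  # initials/finals from bin/tokenizer.py
```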


## Training
Refer to [LibriTTS Training](../libritts/README.md#Training).

## Inference
* Make sure the model arguments (`--decoder-dim 1024 ...`) match the values used in `Training` (a rough checkpoint check is sketched after the command below).
```
python bin/infer.py --output-dir demos \
--top-k -1 --temperature 1.0 \
--model-name "VALL-E" --norm-first true --add-prenet false \
--decoder-dim 1024 --nhead 16 --num-decoder-layers 12 --prefix-mode 1 \
--text-prompts "甚至出现交易几乎停滞的情况" \
--audio-prompts ./prompts/ch_24k.wav \
--text "大家好非常高兴能认识大家" \
--checkpoint exp/valle/best-train-loss.pt
```
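
If you are unsure whether a checkpoint was trained with the dimensions above, one rough check is to load it and print a few parameter shapes. This sketch assumes a torch-saved dict that keeps the weights under a `"model"` key; adjust if your checkpoint layout differs:
```
# Rough sanity check; the "model" key is an assumption about the checkpoint layout.
import torch

ckpt = torch.load("exp/valle/best-train-loss.pt", map_location="cpu")
state = ckpt.get("model", ckpt) if isinstance(ckpt, dict) else ckpt

# Parameter shapes reflect --decoder-dim / --num-decoder-layers used at training time.
for name, tensor in list(state.items())[:5]:
    print(name, tuple(tensor.shape))
```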
1 change: 1 addition & 0 deletions egs/aishell1/bin
Binary file added egs/aishell1/demos/0_demo.wav
Binary file not shown.
102 changes: 102 additions & 0 deletions egs/aishell1/prepare.sh
@@ -0,0 +1,102 @@
#!/usr/bin/env bash

set -eou pipefail

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

nj=16
stage=-1
stop_stage=4

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/aishell
# You can download aishell from https://www.openslr.org/33/
#

dl_dir=$PWD/download

dataset_parts="-p train -p dev -p test" # debug

text_extractor="pypinyin_initials_finals"
audio_extractor="Encodec" # or Fbank
audio_feats_dir=data/tokenized

. shared/parse_options.sh || exit 1


# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "dl_dir: $dl_dir"
  log "Stage 0: Download data"

  # If you have pre-downloaded it to /path/to/aishell,
  # you can create a symlink
  #
  #   ln -sfv /path/to/aishell $dl_dir/aishell
  #
  if [ ! -d $dl_dir/aishell/dev ]; then
    lhotse download aishell $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare aishell manifest"
  # We assume that you have downloaded the aishell corpus
  # to $dl_dir/aishell
  mkdir -p data/manifests
  if [ ! -e data/manifests/.aishell.done ]; then
    lhotse prepare aishell $dl_dir/aishell data/manifests
    touch data/manifests/.aishell.done
  fi
fi


if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Tokenize/Fbank aishell"
  mkdir -p ${audio_feats_dir}
  if [ ! -e ${audio_feats_dir}/.aishell.tokenize.done ]; then
    python3 bin/tokenizer.py --dataset-parts "${dataset_parts}" \
      --text-extractor ${text_extractor} \
      --audio-extractor ${audio_extractor} \
      --prefix "aishell" \
      --src-dir "data/manifests" \
      --output-dir "${audio_feats_dir}"
    touch ${audio_feats_dir}/.aishell.tokenize.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare aishell train/dev/test"
  if [ ! -e ${audio_feats_dir}/.aishell.train.done ]; then
    # train
    lhotse copy \
      ${audio_feats_dir}/aishell_cuts_train.jsonl.gz \
      ${audio_feats_dir}/cuts_train.jsonl.gz

    # dev
    lhotse subset --first 400 \
      ${audio_feats_dir}/aishell_cuts_dev.jsonl.gz \
      ${audio_feats_dir}/cuts_dev.jsonl.gz

    # test
    lhotse copy \
      ${audio_feats_dir}/aishell_cuts_test.jsonl.gz \
      ${audio_feats_dir}/cuts_test.jsonl.gz

    touch ${audio_feats_dir}/.aishell.train.done
  fi
fi
1 change: 1 addition & 0 deletions egs/aishell1/prompts/ch_24k.txt
@@ -0,0 +1 @@
大家好非常高兴能认识大家
Binary file added egs/aishell1/prompts/ch_24k.wav
Binary file not shown.
1 change: 1 addition & 0 deletions egs/aishell1/shared
17 changes: 14 additions & 3 deletions valle/bin/tokenizer.py
@@ -66,6 +66,12 @@ def get_args():
        default=Path("data/tokenized"),
        help="Path to the tokenized files",
    )
    parser.add_argument(
        "--text-extractor",
        type=str,
        default="espeak",
        help="espeak or pypinyin or pypinyin_initials_finals",
    )
    parser.add_argument(
        "--audio-extractor",
        type=str,
@@ -118,7 +124,7 @@ def main():
        suffix=args.suffix,
    )

    text_tokenizer = TextTokenizer()
    text_tokenizer = TextTokenizer(backend=args.text_extractor)

    # Fix RuntimeError: Cowardly refusing to serialize non-leaf tensor...
    # by remove encodec weight_norm
@@ -158,7 +164,7 @@ def main():
                        f"{args.output_dir}/{args.prefix}_fbank_{partition}"
                    )

                if args.prefix == "ljspeech":
                if args.prefix == "ljspeech" or args.prefix == "aishell":
                    cut_set = cut_set.resample(24000)

                with torch.no_grad():
@@ -176,13 +182,18 @@ def main():
                        text = c.supervisions[0].custom["normalized_text"]
                        text = text.replace("”", '"').replace("“", '"')
                        phonemes = tokenize_text(text_tokenizer, text=text)
                    elif args.prefix == "aishell":
                        phonemes = tokenize_text(
                            text_tokenizer, text=c.supervisions[0].text
                        )
                        c.supervisions[0].custom = {}
                    else:
                        assert args.prefix == "libritts"
                        phonemes = tokenize_text(
                            text_tokenizer, text=c.supervisions[0].text
                        )
                    c.supervisions[0].custom["tokens"] = {"text": phonemes}
                    unique_symbols.update(list(phonemes))
                    unique_symbols.update(phonemes)

            cuts_filename = f"{args.prefix}_cuts_{partition}.{args.suffix}"
            cut_set.to_file(f"{args.output_dir}/{cuts_filename}")