diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..ecec9be --- /dev/null +++ b/.editorconfig @@ -0,0 +1,2 @@ +indent_style=space +indent_size=4 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0a29d82..a23f4a0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,24 +2,34 @@ __pycache__/ # Environments .env +.envcpu +.envgpu .venv +.venvcpu +.venvgpu env/ +envcpu/ +envgpu/ venv/ venvcpu/ +venvgpu/ ENV/ env.bak/ venv.bak/ venvtest/ # Project specific -user/ -temp/ -ignore/ +speech_translate/_user/ +speech_translate/temp/ +speech_translate/debug/ +speech_translate/export/ +speech_translate/log/ build/ -log/ dist/ output/ -export/ + +# ignore +ignore/ # created when building LICENSE.txt diff --git a/.vscode/settings.json b/.vscode/settings.json index 112d580..023b931 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,27 @@ { - "python.analysis.typeCheckingMode": "basic" -} + "python.languageServer": "Pylance", + "python.analysis.typeCheckingMode": "basic", + "[python]": { + "editor.defaultFormatter": "eeyore.yapf", + "editor.formatOnSave": true, + "editor.formatOnPaste": true, + "editor.formatOnType": false, + "editor.codeActionsOnSave": { + "source.fixAll": false, + "source.organizeImports": false, + "source.organizeImports.ruff": false, + "source.organizeImports.python": false, + } + }, + "yapf.args": ["--style", "{based_on_style: pep8, indent_width: 4, column_limit: 125, BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF: false, DEDENT_CLOSING_BRACKETS: true}"], + "ruff.enable": true, + "ruff.lint.args": [ + "--line-length", + "125" + ], + "ruff.format.args": [ + "--line-length", + "125" + ], + "python.analysis.autoImportCompletions": false, +} \ No newline at end of file diff --git a/README.md b/README.md index 47e47d4..fcfeee8 100644 --- a/README.md +++ b/README.md @@ -15,23 +15,40 @@ GitHub forks

-Speech Translate is a practical application that combines OpenAI's Whisper ASR model with free translation APIs. It serves as a versatile tool for both real-time / live speech-to-text and speech translation, allowing the user to seamlessly convert spoken language into written text. Additionally, it has the option to import and transcribe audio / video files effortlessly. This application aims to expand whisper ability by combining it with some translation APIs while also providing a simple and easy to use interface to create a more practical application. This application is also open source, so you can contribute to this project if you want to. - -
- Preview -

- Speech Translate Looks - Setting transcription - Setting textbox - About window - Detached window preview - Detached window preview - Transcribe mode on detached window (English) - Transcribe mode on detached window (English) - Translate mode on detached window (English to Indonesia) - Translate mode on detached window (English to Indonesia) -

-
+Speech Translate is a practical application that combines OpenAI's Whisper ASR model with free translation APIs. It serves as a versatile tool for both real-time / live speech-to-text and speech translation, allowing the user to seamlessly convert spoken language into written text. Additionally, it has the option to import and transcribe audio / video files effortlessly. + +Speech Translate aims to extend Whisper's capabilities by combining it with some translation APIs while also providing a simple, easy-to-use interface to create a more practical application. This application is also open source, so you can contribute to this project if you want to. + +

+ Speech Translate Preview +

+ +
+ Preview - Usage +

+ Record + File import + File import in progress + Align result + Refine result + Translate Result + Transcribe mode on subtitle window (English)
+ Transcribe mode on detached window (English) + Translate mode on subtitle window (English to Indonesia)
+ Translate mode on detached window (English to Indonesia) +

+
+ +
+ Preview - Setting +

+ Setting - General + Setting - Record + Setting - Transcribe + Setting - Translate + Setting - Textbox +

+

@@ -74,9 +91,16 @@ Speech Translate is a practical application that combines OpenAI's Whisper ASR m - Speaker input only work on windows 8 and above. - Internet connection (for translation with API) -- [FFmpeg](https://ffmpeg.org/) is required to be installed and added to the PATH environment variable. You can download it [here](https://ffmpeg.org/download.html) and add it to your path manually OR you can do it automatically using the following commands: +- [FFmpeg](https://ffmpeg.org/) is required to be installed and added to the PATH environment variable. You can do it when prompted in the app, or you can download it [here](https://ffmpeg.org/download.html) and add it to your path manually. Alternatively, you can also download and add it to path automatically by using the following commands: + +```bash +# on Windows using powershell (Also included in the release page, and can be run by right clicking and selecting "Run with PowerShell") +# Must be run in an elevated PowerShell prompt (Run as administrator) +Set-ExecutionPolicy RemoteSigned -Scope CurrentUser # Optional: Needed to run a remote script the first time +& ([scriptblock]::Create( + (New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Dadangdut33/Speech-Translate/master/install_ffmpeg.ps1') + )) -webdl -``` # on Windows using Winget (Default package manager for Windows 10 and above) winget install --id=Gyan.FFmpeg -e @@ -106,20 +130,21 @@ brew install ffmpeg | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x | | large | 1550 M | N/A | `large` | ~10 GB | 1x | -\* This information is also available in the app (hover over the model selection in the app and there will be a tooltip about the model info). +\* This information is also available in the app (hover over the model selection in the app and there will be a tooltip about the model info). 
Also note that when using faster-whisper, the speed will be significantly faster and the model size will be reduced depending on the usage. For more information about this, please visit the [faster-whisper repository](https://github.com/guillaumekln/faster-whisper). # Installation > [!IMPORTANT] -> Make sure that you have installed [FFmpeg](https://ffmpeg.org/) and added it to the PATH environment variable. [See here](#requirements) for more info +> Please take a look at the [Requirements](#requirements) first before installing. For more information about the usage of the app, please check the [wiki](https://github.com/Dadangdut33/Speech-Translate/wiki) ## From Prebuilt Binary 1. Download the [latest release](https://github.com/Dadangdut33/Speech-Translate/releases/latest) (There are 2 versions, CPU and GPU) 2. Install/extract the downloaded file 3. Run the program -4. Enjoy! +4. Set the settings to your liking +5. Enjoy! ## As A Module @@ -143,9 +168,9 @@ You can then run the program by typing `speech-translate` in your terminal/conso **Notes For Installation as Module:** -- If you are u**pdating from an older version**, you need to add `--upgrade --no-deps --force-reinstall` at the end of the command. +- If you are **updating from an older version**, you need to add `--upgrade --force-reinstall` at the end of the command. If the update does not need new dependencies, you can add `--no-deps` at the end of the command to speed up the installation process. - If you want to **install** from a **specific branch or commit**, you can do it by adding `@branch_name` or `@commit_hash` at the end of the url. Example: `pip install -U git+https://github.com/Dadangdut33/Speech-Translate.git@dev --extra-index-url https://download.pytorch.org/whl/cu118` -- The **--extra-index-url here might not always be up to date**, so you can check the latest version of pytorch [here](https://pytorch.org/get-started/locally/). 
You can also check the available version of pytorch [here](https://download.pytorch.org/whl/torch_stable.html). +- The **--extra-index-url here might not always be up to date**, so you can check the latest version of pytorch [here](https://pytorch.org/get-started/locally/). You can also check the available version of pytorch [here](https://download.pytorch.org/whl/torch_stable.html). If the newest version is not compatible then please keep using the current url shown here. # More Information diff --git a/Run.py b/Run.py index 1472f19..4dde100 100644 --- a/Run.py +++ b/Run.py @@ -3,4 +3,4 @@ if __name__ == "__main__": main() -# can run the app from this file or by running `python -m speech_translate` \ No newline at end of file +# can run the app from this file or by running `python -m speech_translate` diff --git a/_pyinstaller_hooks/add_lib.py b/_pyinstaller_hooks/add_lib.py deleted file mode 100644 index c740ee7..0000000 --- a/_pyinstaller_hooks/add_lib.py +++ /dev/null @@ -1,4 +0,0 @@ -import sys -import os - -sys.path.append(os.path.join(os.path.dirname(sys.argv[0]), "lib")) diff --git a/build.py b/build.py new file mode 100644 index 0000000..869b1d6 --- /dev/null +++ b/build.py @@ -0,0 +1,122 @@ +import sys +import os +import shutil +from cx_Freeze import setup, Executable + +sys.setrecursionlimit(5000) + + +def get_env_name(): + return os.path.basename(sys.prefix) + + +def version(): + with open(os.path.join(os.path.dirname(__file__), "speech_translate/_version.py")) as f: + return f.readline().split("=")[1].strip().strip('"').strip("'") + + +# If you get cuda error try to remove your cuda from your system path because cx_freeze will try to include it from there +# instead of the one in the python folder +print(">> Building SpeechTranslate version", version()) +print(">> Environment:", get_env_name()) + + +def clear_dir(dir): + print(">> Clearing", dir) + try: + if not os.path.exists(dir): + return + if os.path.isdir(dir): + for f in os.listdir(dir): + 
os.remove(os.path.join(dir, f)) + + # remove the folder + os.rmdir(dir) + else: + os.remove(dir) + except Exception as e: + print(f">> Failed to clear {dir} reason: {e}") + + +print(">> Clearing code folder") +clear_dir("./speech_translate/export") +clear_dir("./speech_translate/debug") +clear_dir("./speech_translate/log") +clear_dir("./speech_translate/temp") +print(">> Done") + +folder_name = f"build/SpeechTranslate {version()}" + +build_exe_options = { + "excludes": ["yapf", "ruff"], + "packages": ["torch", "soundfile", "sounddevice", "av"], + "build_exe": folder_name +} + +base = "Win32GUI" if sys.platform == "win32" else None + +setup( + name="SpeechTranslate", + version=version(), + description="Speech Translate", + options={ + "build_exe": build_exe_options, + }, + executables=[ + Executable( + "Run.py", + base=base, + icon="speech_translate/assets/icon.ico", + target_name="SpeechTranslate.exe", + ) + ], +) + +# check if arg is build_exe +if len(sys.argv) < 2 or sys.argv[1] != "build_exe": + sys.exit(0) + +print(">> Copying some more files...") + +# we need to copy av.libs to foldername/lib because cx_freeze doesn't copy it for some reason +print(">> Copying av.libs to lib folder") +shutil.copytree(f"{get_env_name()}/Lib/site-packages/av.libs", f"{folder_name}/lib/av.libs") + +# copy Lincese as license.txt to build folder +print(">> Creating license.txt to build folder") +with open("LICENSE", "r", encoding="utf-8") as f: + with open(f"{folder_name}/license.txt", "w", encoding="utf-8") as f2: + f2.write(f.read()) + +# copy README.md as README.txt to build folder +print(">> Creating README.txt to build folder") +with open("build/pre_install_note.txt", "r", encoding="utf-8") as f: + with open(f"{folder_name}/README.txt", "w", encoding="utf-8") as f2: + f2.write(f.read()) + +# create version.txt +print(">> Creating version.txt") +with open(f"{folder_name}/version.txt", "w", encoding="utf-8") as f: + f.write(version()) + +# copy install_ffmpeg.ps1 to build folder 
+print(">> Copying install_ffmpeg.ps1 to build folder") +with open("install_ffmpeg.ps1", "r", encoding="utf-8") as f: + with open(f"{folder_name}/install_ffmpeg.ps1", "w", encoding="utf-8") as f2: + f2.write(f.read()) + +# create link to repo +print(">> Creating link to repo") +with open(f"{folder_name}/homepage.url", "w", encoding="utf-8") as f: + f.write("[InternetShortcut]\n") + f.write("URL=https://github.com/Dadangdut33/Speech-Translate") + +print(">> Opening output folder") +output_folder = os.path.abspath(folder_name) +try: + os.startfile(output_folder) +except Exception: + # linux + import subprocess + + subprocess.call(["xdg-open", output_folder]) diff --git a/build/post_install_note.txt b/build/post_install_note.txt new file mode 100644 index 0000000..0a631ed --- /dev/null +++ b/build/post_install_note.txt @@ -0,0 +1,3 @@ +The app has been successfully installed, for more information about its usage please visit the wiki at https://github.com/Dadangdut33/Speech-Translate/wiki. + +For any questions or suggestions, feel free to add any issues or open a discussion on the repository. \ No newline at end of file diff --git a/build/pre_install_note.txt b/build/pre_install_note.txt new file mode 100644 index 0000000..15cfbe5 --- /dev/null +++ b/build/pre_install_note.txt @@ -0,0 +1,20 @@ +Thanks for downloading Speech Translate. + +Speech Translate is a practical application that combines OpenAI's Whisper ASR model with free translation APIs. It serves as a versatile tool for both real-time / live speech-to-text and speech translation, allowing the user to seamlessly convert spoken language into written text. Additionally, it has the option to import and transcribe audio / video files effortlessly. 
+ +Requirements: +- Windows 8.1 or higher for speaker input +- FFmpeg installed in your system (the app will prompt you to install it if you don't have it) +- Internet connection (for translation with API) +- Each whisper model requires the following VRAM: + * tiny (~1 GB) + * base (~1 GB) + * small (~2 GB) + * medium (~5 GB) + * large (~10 GB) + +Whisper can be used with CPU but will be very limited when doing so. It is recommended to use a cuda compatible GPU for better performance. + +Please also note that when using faster-whisper, the speed will be significantly faster and the model size will be reduced depending on the usage. For more information about this please visit https://github.com/guillaumekln/faster-whisper + +For more information about the app, user settings, how to use it, and more please visit the wiki at https://github.com/Dadangdut33/Speech-Translate/wiki \ No newline at end of file diff --git a/build_pyinstaller.py b/build_pyinstaller.py deleted file mode 100644 index cdfc8cd..0000000 --- a/build_pyinstaller.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -Pyinstaller script to move stuff, rename, and also make a cleaner output folder -""" - -import os, shutil, sys -from PyInstaller.__main__ import generate_parser, run # type: ignore -from speech_translate._version import __version__ - - -def run_makespec(filenames, **opts): - print(">> Generating spec file...") - # Split pathex by using the path separator - temppaths = opts["pathex"][:] - pathex = opts["pathex"] = [] - for p in temppaths: - pathex.extend(p.split(os.pathsep)) - - import PyInstaller.building.makespec # type: ignore - - spec_file = PyInstaller.building.makespec.main(filenames, **opts) - return spec_file - - -def get_env_name(): - return os.path.basename(sys.prefix) - - -def get_base_prefix_compat(): - """Get base/real prefix, or sys.prefix if there is none.""" - return getattr(sys, "base_prefix", None) or getattr(sys, "real_prefix", None) or sys.prefix - - -def in_virtualenv(): - return 
get_base_prefix_compat() != sys.prefix - - -if not in_virtualenv(): - print("Please run this script in a virtual environment") - sys.exit(1) - -options = [ - "Run.py", - "-c", # console window. Console window cannot be hidden because it will cause error on whisper transformer logging - "--clean", - "--noconfirm", - "--additional-hooks-dir=./_pyinstaller_hooks", - "--runtime-hook=./_pyinstaller_hooks/add_lib.py", - "--icon=./speech_translate/assets/icon.ico", - "--add-data=./speech_translate/theme;speech_translate/theme", - "--add-data=./speech_translate/assets;speech_translate/assets", - "--add-data=./LICENSE.txt;.", - f"--add-data={get_env_name()}/Lib/site-packages/whisper/assets;whisper/assets/", - "--copy-metadata=tqdm", - "--copy-metadata=regex", - "--copy-metadata=requests", - "--copy-metadata=packaging", - "--copy-metadata=filelock", - "--copy-metadata=numpy", - "--copy-metadata=tokenizers", - "--exclude-module=pyinstaller", -] - -print(f"Currently running in virtual environment {get_env_name()} using python {sys.version}") -specName = f"SpeechTranslate {__version__}" -argsName = f"-n{specName}" # name of the spec file - -options.append(argsName) -# ----------------- -# make spec file -parser = generate_parser() -args = parser.parse_args(options) -run_makespec(**vars(args)) - -# Edit spec folder -folderName = f"{specName} {get_env_name()}" -specFile = f"{specName}.spec" -spec = "" -with open(specFile, "r") as f: - spec = f.read() - # add recursion limit after copy_metadata - spec = spec.replace("copy_metadata", "copy_metadata\nimport sys\nsys.setrecursionlimit(5000)", 1) - # rename the exe file - spec = spec.replace(f"name='{specName}'", f"name='SpeechTranslate'", 1) - # rename the build folder name, add venv name to it - spec = spec.replace(f"name='{specName}'", f"name='{folderName}'", 1) - -# write spec file -with open(specFile, "w") as f: - f.write(spec) - -# create license.txt file -with open("LICENSE", "r") as f: - license = f.read() - with 
open("LICENSE.txt", "w") as f2: - f2.write(license) - -# run pyinstaller -run([specFile, "--noconfirm", "--clean"]) - -# delete license.txt file -print(">> Deleting created license.txt file") -os.remove("LICENSE.txt") - -output_folder = f"dist/{folderName}" - -# create lib folder in output folder -lib_folder = f"{output_folder}/lib" -os.mkdir(lib_folder) - -# move all .dll .pyd files to lib folder with some whitelist -# whitelist some dll files and numpy dependencies (libopenblas) -print(">> Moving .dll files to lib folder") -dontMove = ["python3.dll", "python310.dll", "python38.dll", "python39.dll"] -for file in os.listdir(output_folder): - if file.endswith(".dll") or file.endswith(".pyd"): - if file not in dontMove and "libopenblas" not in file: - shutil.move(f"{output_folder}/{file}", f"{lib_folder}/{file}") - -# open folder -print(">> Opening output folder") -output_folder = os.path.abspath(output_folder) -try: - os.startfile(output_folder) -except Exception: - # linux - import subprocess - - subprocess.call(["xdg-open", output_folder]) diff --git a/devSetup.py b/devSetup.py deleted file mode 100644 index 892105c..0000000 --- a/devSetup.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import platform -import time - -pip = "pip" -req = "requirements" -tempfile = req + "_temp" -# check if not windows -if platform.system() != "Windows": - pip = "pip3" - -if __name__ == "__main__": - # ask if user want to use gpu or not - print("-" * 100) - print("This script will try to install the necessary packages for the project") - use_gpu = input("Do you want to use GPU for pytorch? 
(y/n): ") - - # read requirements.txt save as temp - with open(f"{req}.txt", "r") as f: - lines = f.readlines() - - if use_gpu.lower() != "y": - # remove line with --find-links - lines = [line for line in lines if not line.startswith("--find-links")] - - # write temp to requirements_temp.txt - with open(f"{tempfile}.txt", "w") as f: - f.writelines(lines) - - timeStart = time.time() - # install requirements - print("-" * 100) - print(f"Installing from {tempfile}.txt") - os.system(f"{pip} install -r {tempfile}.txt") - - # delete temp file - os.remove(f"{tempfile}.txt") - - print("-" * 100) - print("Done!") - print(f"Total time {time.time() - timeStart: .2f}") - print("-" * 100) - print("IF PYTORCH version is not compatible with your system, please install it manually with direction located at https://pytorch.org/") diff --git a/install_ffmpeg.ps1 b/install_ffmpeg.ps1 new file mode 100644 index 0000000..03f720a --- /dev/null +++ b/install_ffmpeg.ps1 @@ -0,0 +1,56 @@ +param ( + [switch]$webdl +) + +$isAdministrator = [Security.Principal.WindowsPrincipal]::new([Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator) +$arguments = [System.Environment]::GetCommandLineArgs() + +# MUST BE RUN AS ADMINISTRATOR, but when run from a webdl, it will not be forced +if (-NOT $isAdministrator -AND -NOT $webdl) +{ + $arguments = "& '" +$myinvocation.mycommand.definition + "'" + Start-Process powershell -Verb runAs -ArgumentList $arguments + Break +} + +if (-NOT $isAdministrator) +{ + Write-Host "WARNING: This script must be run as administrator to correctly add ffmpeg to the system path." 
+} + +# modified a little from https://adamtheautomator.com/install-ffmpeg/ +New-Item -Type Directory -Path C:\ffmpeg +Set-Location C:\ffmpeg +curl.exe -L 'https://github.com/GyanD/codexffmpeg/releases/download/6.0/ffmpeg-6.0-essentials_build.zip' -o 'ffmpeg.zip' + +# Expand the Zip +Expand-Archive .\ffmpeg.zip -Force -Verbose + +# Move the executable (*.exe) files to the top folder +Get-ChildItem -Recurse -Path .\ffmpeg -Filter *.exe | +ForEach-Object { + $source = $_.FullName + $destination = Join-Path -Path . -ChildPath $_.Name + Move-Item -Path $source -Destination $destination -Force -Verbose +} + +# # Clean up +Write-Host "Cleaning up..." +Remove-Item .\ffmpeg\ -Recurse +Remove-Item .\ffmpeg.zip + +# List the directory contents +Get-ChildItem + +# Prepend the FFmpeg folder path to the system path variable +Write-Host "Adding ffmpeg to the system path..." +[System.Environment]::SetEnvironmentVariable( + "PATH", + "C:\ffmpeg\;$([System.Environment]::GetEnvironmentVariable('PATH','MACHINE'))", + "Machine" +) +Write-Host "ffmpeg has been added to the system path." + +$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + +Write-Host "check it by running ffmpeg -version" \ No newline at end of file diff --git a/installer.iss b/installer.iss new file mode 100644 index 0000000..8e34efb --- /dev/null +++ b/installer.iss @@ -0,0 +1,56 @@ +; Script generated by the Inno Setup Script Wizard. +; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES! + +#define MyAppName "Speech Translate" +#define MyAppVersion "1.3.0" +#define MyAppPublisher "Dadangdut33" +#define MyAppURL "https://github.com/Dadangdut33/Speech-Translate" +#define MyAppExeName "SpeechTranslate.exe" + +[Setup] +; NOTE: The value of AppId uniquely identifies this application. Do not use the same AppId value in installers for other applications. +; (To generate a new GUID, click Tools | Generate GUID inside the IDE.) 
+AppId={{EDE12D07-73B0-4B1F-91C3-A0ECE1AB3F7C} +AppName={#MyAppName} +AppVersion={#MyAppVersion} +;AppVerName={#MyAppName} {#MyAppVersion} +AppPublisher={#MyAppPublisher} +AppPublisherURL={#MyAppURL} +AppSupportURL={#MyAppURL} +AppUpdatesURL={#MyAppURL} +DefaultDirName={autopf}\{#MyAppName} +DefaultGroupName={#MyAppName} +AllowNoIcons=yes +LicenseFile=build\SpeechTranslate {#MyAppVersion}\LICENSE.txt +InfoBeforeFile=build\pre_install_note.txt +InfoAfterFile=build\post_install_note.txt +; Remove the following line to run in administrative install mode (install for all users.) +PrivilegesRequired=lowest +PrivilegesRequiredOverridesAllowed=commandline +OutputDir=dist +OutputBaseFilename=SpeechTranslate +SetupIconFile=speech_translate\assets\icon.ico +Compression=lzma +SolidCompression=yes +WizardStyle=modern + +[Languages] +Name: "english"; MessagesFile: "compiler:Default.isl" + +[Tasks] +Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: unchecked + +[Files] +Source: "build\SpeechTranslate {#MyAppVersion}\{#MyAppExeName}"; DestDir: "{app}"; Flags: ignoreversion +Source: "build\SpeechTranslate {#MyAppVersion}\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs createallsubdirs +; NOTE: Don't use "Flags: ignoreversion" on any shared system files + +[Icons] +Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}" +Name: "{group}\{cm:ProgramOnTheWeb,{#MyAppName}}"; Filename: "{#MyAppURL}" +Name: "{group}\{cm:UninstallProgram,{#MyAppName}}"; Filename: "{uninstallexe}" +Name: "{autodesktop}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; Tasks: desktopicon + +[Run] +Filename: "{app}\{#MyAppExeName}"; Description: "{cm:LaunchProgram,{#StringChange(MyAppName, '&', '&&')}}"; Flags: nowait postinstall skipifsilent + diff --git a/preview/1.png b/preview/1.png new file mode 100644 index 0000000..c8b14db Binary files /dev/null and b/preview/1.png differ diff --git a/preview/10.png b/preview/10.png new file 
mode 100644 index 0000000..2d32de5 Binary files /dev/null and b/preview/10.png differ diff --git a/preview/11.png b/preview/11.png new file mode 100644 index 0000000..da3d07d Binary files /dev/null and b/preview/11.png differ diff --git a/preview/12.png b/preview/12.png new file mode 100644 index 0000000..1ee87e2 Binary files /dev/null and b/preview/12.png differ diff --git a/preview/13.png b/preview/13.png new file mode 100644 index 0000000..c015ef6 Binary files /dev/null and b/preview/13.png differ diff --git a/preview/14.png b/preview/14.png new file mode 100644 index 0000000..28e2c8f Binary files /dev/null and b/preview/14.png differ diff --git a/preview/2.png b/preview/2.png new file mode 100644 index 0000000..6788c4c Binary files /dev/null and b/preview/2.png differ diff --git a/preview/3.png b/preview/3.png new file mode 100644 index 0000000..8be00fd Binary files /dev/null and b/preview/3.png differ diff --git a/preview/4.png b/preview/4.png new file mode 100644 index 0000000..60ae6f3 Binary files /dev/null and b/preview/4.png differ diff --git a/preview/5.png b/preview/5.png new file mode 100644 index 0000000..1e5df6a Binary files /dev/null and b/preview/5.png differ diff --git a/preview/6.png b/preview/6.png new file mode 100644 index 0000000..6633b71 Binary files /dev/null and b/preview/6.png differ diff --git a/preview/7.png b/preview/7.png new file mode 100644 index 0000000..eb7b5d1 Binary files /dev/null and b/preview/7.png differ diff --git a/preview/8.png b/preview/8.png new file mode 100644 index 0000000..e5402a3 Binary files /dev/null and b/preview/8.png differ diff --git a/preview/9.png b/preview/9.png new file mode 100644 index 0000000..0febf63 Binary files /dev/null and b/preview/9.png differ diff --git a/requirements.txt b/requirements.txt index e861452..0e18dff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,26 @@ torch torchvision torchaudio -deep-translator==1.11.0 +deep-translator==1.11.4 notify-py==0.3.42 +loguru 
pillow==9.5.0 pywin32==306; platform_system == "Windows" -PyAudioWPatch==0.2.12.5; platform_system == "Windows" +PyAudioWPatch==0.2.12.6; platform_system == "Windows" PyAudio==0.2.13; platform_system != "Windows" -pystray==0.19.4 -requests==2.28.2 -scipy==1.10.1 +pystray==0.19.5 +tkhtmlview==0.2.0 +tksheet==6.2.9 +requests==2.31.0 +scipy==1.11.3 sounddevice==0.4.6 -soundfile==0.11.0 +soundfile==0.12.1 +webrtcvad==2.0.10 darkdetect==0.8.0 arabic-reshaper==3.0.0 -openai-whisper==20230314 -whisper-timestamped @ git+https://github.com/linto-ai/whisper-timestamped.git \ No newline at end of file +python-bidi==0.4.2 +matplotlib==3.8.0 +onnxruntime==1.16.1 +demucs==4.0.1 +stable-ts @ git+https://github.com/jianfch/stable-ts.git@5c512a1880b937025792d441b98f5a13ab5a735e +faster-whisper==0.9.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 49950d0..7b0ffab 100644 --- a/setup.py +++ b/setup.py @@ -16,13 +16,13 @@ def install_requires(): with open("requirements.txt", "r", encoding="utf-8") as f: req = f.read().splitlines() return req - -print(install_requires()) + setup( name="SpeechTranslate", version=version(), - description="A realtime speech transcription and translation application using Whisper OpenAI and free translation API. Interface made using Tkinter. Code written fully in Python.", + description="A realtime speech transcription and translation application using Whisper OpenAI and free translation API." + " Interface made using Tkinter. 
Code written fully in Python.", long_description=read_me(), long_description_content_type="text/markdown", python_requires=">=3.8", @@ -32,10 +32,16 @@ def install_requires(): packages=[ "speech_translate", "speech_translate.utils", - "speech_translate.components", - "speech_translate.components.abstract", - "speech_translate.components.custom", - "speech_translate.components.window", + "speech_translate.utils.audio", + "speech_translate.utils.translate", + "speech_translate.utils.tk", + "speech_translate.utils.whisper", + "speech_translate.ui", + "speech_translate.ui.template", + "speech_translate.ui.custom", + "speech_translate.ui.window", + "speech_translate.ui.frame", + "speech_translate.ui.frame.settings", "speech_translate.assets", "speech_translate.theme", "speech_translate.theme.skip", @@ -50,10 +56,8 @@ def install_requires(): "speech_translate.theme.sv.resource": ["*"], }, install_requires=install_requires(), - entry_points={ - "console_scripts": [ - "speech-translate=speech_translate.__main__:main", - ] - }, + entry_points={"console_scripts": [ + "speech-translate=speech_translate.__main__:main", + ]}, include_package_data=True, ) diff --git a/speech_translate/__main__.py b/speech_translate/__main__.py index 02c6c71..2361165 100644 --- a/speech_translate/__main__.py +++ b/speech_translate/__main__.py @@ -1,28 +1,4 @@ -import platform - -from ._version import __version__ -from .custom_logging import logger - -from .components.window.main import MainWindow, AppTray, get_gpu_info, check_cuda_and_gpu -from .components.window.about import AboutWindow -from .components.window.log import LogWindow -from .components.window.setting import SettingWindow -from .components.window.transcribed import TcsWindow -from .components.window.translated import TlsWindow - -def main(): - logger.info(f"App Version: {__version__}") - logger.info(f"OS: {platform.system()} {platform.release()} {platform.version()} | CPU: {platform.processor()}") - logger.info(f"GPU: 
{get_gpu_info()} | CUDA: {check_cuda_and_gpu()}") - # --- GUI --- - AppTray() # Start tray app in the background - main = MainWindow() - TcsWindow(main.root) - TlsWindow(main.root) - SettingWindow(main.root) - LogWindow(main.root) - AboutWindow(main.root) - main.root.mainloop() # Start main app +from .ui.window.main import main if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/speech_translate/_constants.py b/speech_translate/_constants.py new file mode 100644 index 0000000..947ccae --- /dev/null +++ b/speech_translate/_constants.py @@ -0,0 +1,6 @@ +APP_NAME: str = "Speech Translate" +SUBTITLE_PLACEHOLDER = " " * 100 +PREVIEW_WORDS = "1234567 Preview Hello مرحبًا プレビュー こんにちは 预习 你好 привет" +WHISPER_SR = 16_000 +MIN_THRESHOLD = -61 +MAX_THRESHOLD = 1 diff --git a/speech_translate/_contants.py b/speech_translate/_contants.py deleted file mode 100644 index 61450ad..0000000 --- a/speech_translate/_contants.py +++ /dev/null @@ -1,4 +0,0 @@ -APP_NAME: str = "Speech Translate" -RESHAPE_LANG_LIST = ["arabic", "urdu", "faroese"] -SUBTITLE_PLACEHOLDER = " " * 100 -PREVIEW_WORDS = "1234567 Preview プレビュー 预习 предварительный просмотр" \ No newline at end of file diff --git a/speech_translate/_logging.py b/speech_translate/_logging.py new file mode 100644 index 0000000..4ef34d2 --- /dev/null +++ b/speech_translate/_logging.py @@ -0,0 +1,136 @@ +import os +import re +import sys +from time import strftime + +from loguru import logger +from ._path import dir_log + +# ------------------ # +current_log: str = f"{strftime('%Y-%m-%d %H-%M-%S')}.log" +# make sure log folder exist +if not os.path.exists(dir_log): + try: + os.makedirs(dir_log) + except Exception as e: + print("Error: Cannot create log folder") + print(e) + + +def shorten_progress_bar(match): + percentage = match.group(1) + bar = "#" * len(percentage) # make it a bit longer + return f"{percentage} | {bar} |" + + +# class StreamStdoutToLogger(object): +# """ +# Fake file-like stream object 
that redirects writes to a logger instance. +# """ +# def __init__(self, level): +# self.level = level +# self.ignore_list = [] + +# def write(self, buf): +# for line in buf.rstrip().splitlines(): +# line = line.strip() + +# # ignore if any keywords from ignore_list is in the line +# if any(x in line for x in self.ignore_list): +# continue + +# # checking if line is empty. exception use ^ ~ to point out the error +# # but we don't need it in logger because logger is per line +# check_empty = line.replace("^", "").replace("~", "").strip() +# if len(check_empty) == 0: +# continue + +# logger.log(self.level, line) + +# def flush(self): +# pass + +recent_stderr = [] + + +class StreamStderrToLogger(object): + """ + For stderr and tqdm progress bar + """ + def __init__(self, level): + self.level = level + # tqdm use stderr to print, so we should consider it as info + self.considered_info = [ + "Downloading", "Fetching", "run_threaded", "Estimating duration from bitrate, this may be inaccurate", + "Transcribe", "Translate", "Refine", "Align", "Running", "done" + ] + + def write(self, buf): + for line in buf.rstrip().splitlines(): + line = line.strip().replace("", "") + + # checking if line is empty. exception use ^ ~ to point out the error + # but we don't need it in logger because logger is per line + check_empty = line.replace("^", "").replace("~", "").strip() + if len(check_empty) == 0: + continue + + # check where is it from. 
if keywords from considered_info is in the line then log as info + if any(x in line for x in self.considered_info): + shorten = re.sub(r'(\d+%)(\s*)\|(.+?)\|', shorten_progress_bar, line) + logger.log("INFO", shorten) + recent_stderr.append(shorten) + else: + logger.log(self.level, line) + recent_stderr.append(line) + + # limit to max 10 + if len(recent_stderr) > 10: + recent_stderr.pop(0) + + def flush(self): + pass + + +log_format = '{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <7} | {file}:{line} [{thread.name}] - {message}' +stdout_id = None +file_id = None + + +def init_logging(level): + global stdout_id, file_id + # reset logger + logger.remove() + + # add handler + stdout_id = logger.add(sys.stderr, level=level, backtrace=False, diagnose=True, format=log_format) + file_id = logger.add( + dir_log + "/" + current_log, level="DEBUG", encoding="utf-8", backtrace=False, diagnose=True, format=log_format + ) + + # sys.stdout = StreamStdoutToLogger("INFO") + sys.stderr = StreamStderrToLogger("ERROR") + # tqdm use stderr so we also need to redirect it + # stderr might be more informative in its original form so you can comment it out if you want when developing + + +def change_log_level(level: str): + global current_log, stdout_id, file_id + logger.remove(stdout_id) + stdout_id = logger.add(sys.stdout, level=level, backtrace=False, diagnose=True) + + logger.remove(file_id) + file_id = logger.add(dir_log + "/" + current_log, level=level, encoding="utf-8", backtrace=False, diagnose=True) + + +# def update_stdout_ignore_list(ignore_list): +# assert isinstance(sys.stdout, StreamStdoutToLogger) +# sys.stdout.ignore_list = ignore_list + + +def clear_current_log_file(): + global current_log, stdout_id, file_id + logger.remove(file_id) + with open(dir_log + "/" + current_log, "w") as f: + f.write("") + file_id = logger.add(dir_log + "/" + current_log, level="DEBUG", encoding="utf-8", backtrace=False, diagnose=True) diff --git a/speech_translate/_path.py 
b/speech_translate/_path.py index 0040712..aebc93b 100644 --- a/speech_translate/_path.py +++ b/speech_translate/_path.py @@ -2,17 +2,23 @@ # Paths dir_project: str = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)))) -dir_user: str = os.path.abspath(os.path.join(dir_project, "user")) +dir_user: str = os.path.abspath(os.path.join(dir_project, "_user")) dir_theme: str = os.path.abspath(os.path.join(dir_project, "theme")) -dir_setting: str = os.path.abspath(os.path.join(dir_project, "setting")) dir_temp: str = os.path.abspath(os.path.join(dir_project, "temp")) +dir_debug: str = os.path.abspath(os.path.join(dir_project, "debug")) dir_log: str = os.path.abspath(os.path.join(dir_project, "log")) dir_assets: str = os.path.abspath(os.path.join(dir_project, "assets")) dir_export: str = os.path.abspath(os.path.join(dir_project, "export")) +dir_refinement: str = os.path.abspath(os.path.join(dir_export, "refinement")) +dir_translate: str = os.path.abspath(os.path.join(dir_export, "translated")) +dir_alignment: str = os.path.abspath(os.path.join(dir_export, "alignment")) app_icon: str = os.path.abspath(os.path.join(dir_assets, "icon.ico")) +splash_image: str = os.path.abspath(os.path.join(dir_assets, "splash.png")) +parameters_text: str = os.path.abspath(os.path.join(dir_assets, "parameter.txt")) +ffmpeg_ps_script: str = os.path.abspath(os.path.join(dir_project, "..", "install_ffmpeg.ps1")) # verify app_icon exist or not -if not os.path.exists(app_icon): +if not os.path.exists(app_icon): app_icon_missing = True else: app_icon_missing = False diff --git a/speech_translate/_version.py b/speech_translate/_version.py index 3f817f9..23bee39 100644 --- a/speech_translate/_version.py +++ b/speech_translate/_version.py @@ -1,2 +1,2 @@ -__version__ = "1.2.3" -__setting_version__ = "1.1.0" # only updated on major changes +__version__ = "1.3.0" +__setting_version__ = "1.3.0" # only updated on major changes diff --git a/speech_translate/assets/1.png 
b/speech_translate/assets/1.png deleted file mode 100644 index bba7511..0000000 Binary files a/speech_translate/assets/1.png and /dev/null differ diff --git a/speech_translate/assets/2.png b/speech_translate/assets/2.png deleted file mode 100644 index f2420ea..0000000 Binary files a/speech_translate/assets/2.png and /dev/null differ diff --git a/speech_translate/assets/3.png b/speech_translate/assets/3.png deleted file mode 100644 index e6e14ec..0000000 Binary files a/speech_translate/assets/3.png and /dev/null differ diff --git a/speech_translate/assets/4.png b/speech_translate/assets/4.png deleted file mode 100644 index d176888..0000000 Binary files a/speech_translate/assets/4.png and /dev/null differ diff --git a/speech_translate/assets/5.png b/speech_translate/assets/5.png deleted file mode 100644 index d35129d..0000000 Binary files a/speech_translate/assets/5.png and /dev/null differ diff --git a/speech_translate/assets/6.png b/speech_translate/assets/6.png deleted file mode 100644 index 364204f..0000000 Binary files a/speech_translate/assets/6.png and /dev/null differ diff --git a/speech_translate/assets/7.png b/speech_translate/assets/7.png deleted file mode 100644 index a33d577..0000000 Binary files a/speech_translate/assets/7.png and /dev/null differ diff --git a/speech_translate/assets/parameter.txt b/speech_translate/assets/parameter.txt new file mode 100644 index 0000000..a3a0138 --- /dev/null +++ b/speech_translate/assets/parameter.txt @@ -0,0 +1,238 @@ +Command line arguments to be used. (The usage values shown here are examples, for reference only). 
+ +For more information, see https://github.com/jianfch/stable-ts or https://github.com/Dadangdut33/Speech-Translate/wiki +# [command] +* description of command +* type: data type, default xxx +* usage: --command xxx + +# [device] +* description: device to use for PyTorch inference (A Cuda compatible GPU and PyTorch with CUDA support are still required for GPU / CUDA) +* type: str, default cuda +* usage: --device cpu + +# [cpu_preload] +* description: load model into CPU memory first then move model to specified device; this reduces GPU memory usage when loading model. +* type: bool, default True +* usage: --cpu_preload True + +# [dynamic_quantization] +* description: whether to apply Dynamic Quantization to model to reduce memory usage (~half less) and increase inference speed at cost of slight decrease in accuracy; Only for CPU; NOTE: overhead might make inference slower for models smaller than 'large' +* type: bool, default False +* usage: --dynamic_quantization + +# [prepend_punctuations] +* description: Punctuations to prepend to the next word +* type: str, default "'“¿([{-" +* usage: --prepend_punctuations "" + +# [append_punctuations] +* description: Punctuations to append to the previous word +* type: str, default ""'.。,,!!??::”)]}、" +* usage: --append_punctuations "" + +# [gap_padding] +* description: padding to prepend to each segment for word timing alignment; used to reduce the probability of the model predicting timestamps earlier than the first utterance +* type: str, default " ..." +* usage: --gap_padding "padding" + +# [word_timestamps] +* description: extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment; disabling this will prevent segments from splitting/merging properly. 
+* type: bool, default True +* usage: --word_timestamps True + +# [regroup] +* description: whether to regroup all words into segments with more natural boundaries; specify a string for customizing the regrouping algorithm; ignored if [word_timestamps]=False. +* type: str, default "True" +* usage: --regroup "regroup_option" + +# [ts_num] +* description: number of extra inferences to perform to find the mean timestamps +* type: int, default 0 +* usage: --ts_num + +# [ts_noise] +* description: percentage of noise to add to audio_features to perform inferences for [ts_num] +* type: float, default 0.1 +* usage: --ts_noise 0.1 + +# [suppress_silence] +* description: whether to suppress timestamps where audio is silent at segment-level and word-level if [suppress_word_ts]=True +* type: bool, default True +* usage: --suppress_silence True + +# [suppress_word_ts] +* description: whether to suppress timestamps where audio is silent at word-level; ignored if [suppress_silence]=False +* type: bool, default True +* usage: --suppress_word_ts True + +# [suppress_ts_tokens] +* description: whether to use a silence mask to suppress silent timestamp tokens during inference; increases word accuracy in some cases, but tends to reduce 'verbatimness' of the transcript; ignored if [suppress_silence]=False +* type: bool, default False +* usage: --suppress_ts_tokens True + +# [q_levels] +* description: quantization levels for generating timestamp suppression mask; acts as a threshold for marking sound as silent; fewer levels will increase the threshold of volume at which to mark a sound as silent +* type: int, default 20 +* usage: --q_levels + +# [k_size] +* description: Kernel size for average pooling waveform to generate suppression mask; recommend 5 or 3; higher sizes will reduce detection of silence +* type: int, default 5 +* usage: --k_size 5 + +# [time_scale] +* description: factor for scaling audio duration for inference; greater than 1.0 'slows down' the audio; less than 1.0 'speeds 
up' the audio; 1.0 is no scaling +* type: float +* usage: --time_scale + +# [vad] +* description: whether to use Silero VAD to generate timestamp suppression mask; Silero VAD requires PyTorch 1.12.0+; Official repo: https://github.com/snakers4/silero-vad +* type: bool, default False +* usage: --vad True + +# [vad_threshold] +* description: threshold for detecting speech with Silero VAD. (Default: 0.35); low threshold reduces false positives for silence detection +* type: float, default 0.35 +* usage: --vad_threshold 0.35 + +# [vad_onnx] +* description: whether to use ONNX for Silero VAD +* type: bool, default False +* usage: --vad_onnx True + +# [min_word_dur] +* description: only allow suppressing timestamps that result in word durations greater than this value +* type: float, default 0.1 +* usage: --min_word_dur 0.1 + +# [max_chars] +* description: maximum number of characters allowed in each segment +* type: int +* usage: --max_chars + +# [max_words] +* description: maximum number of words allowed in each segment +* type: int +* usage: --max_words + +# [demucs] +* description: whether to reprocess the audio track with Demucs to isolate vocals/remove noise; Demucs official repo: https://github.com/facebookresearch/demucs +* type: bool, default False +* usage: --demucs True + +# [only_voice_freq] +* description: whether to only use sound between 200 - 5000 Hz, where the majority of human speech is. 
+* type: bool +* usage: --only_voice_freq True + +# [strip] +* description: whether to remove spaces before and after text on each segment for output +* type: bool, default True +* usage: --strip True + +# [tag] +* description: a pair of tags used to change the properties of a word at its predicted time; SRT Default: '<font color="#00ff00">', '</font>'; VTT Default: '<u>', '</u>'; ASS Default: '{\1c&HFF00&}', '{\r}' +* type: str +* usage: --tag " " + +# [reverse_text] +* description: whether to reverse the order of words for each segment of text output +* type: bool, default False +* usage: --reverse_text True + +# [font] +* description: word font for ASS output(s) +* type: str, default 'Arial' +* usage: --font "" + +# [font_size] +* description: word font size for ASS output(s) +* type: int, default 48 +* usage: --font_size 48 + +# [karaoke] +* description: whether to use progressive filling highlights for karaoke effect (only for ASS outputs) +* type: bool, default False +* usage: --karaoke True + +# [temperature] +* description: temperature to use for sampling +* type: float, default 0 +* usage: --temperature + +# [best_of] +* description: number of candidates when sampling with non-zero temperature +* type: int +* usage: --best_of + +# [beam_size] +* description: number of beams in beam search, only applicable when temperature is zero +* type: int +* usage: --beam_size + +# [patience] +* description: optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search +* type: float +* usage: --patience + +# [length_penalty] +* description: optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default +* type: float +* usage: --length_penalty + +# [fp16] +* description: whether to perform inference in fp16; True by default +* type: bool, default True +* usage: --fp16 + +# [compression_ratio_threshold] +* description: if the gzip compression ratio 
is higher than this value, treat the decoding as failed +* type: float +* usage: --compression_ratio_threshold + +# [logprob_threshold] +* description: if the average log probability is lower than this value, treat the decoding as failed +* type: float +* usage: --logprob_threshold + +# [no_speech_threshold] +* description: if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence +* type: float, default 0.6 +* usage: --no_speech_threshold 0.6 + +# [threads] +* description: number of threads used by torch for CPU inference; supersedes MKL_NUM_THREADS/OMP_NUM_THREADS +* type: int +* usage: --threads + +# [mel_first] +* description: process the entire audio track into a log-Mel spectrogram first instead of in chunks +* type: bool +* usage: --mel_first + +# [demucs_option] +* description: Extra option(s) to use for Demucs; Replace True/False with 1/0; E.g. --demucs_option "shifts=3" --demucs_option "overlap=0.5" +* type: str +* usage: --demucs_option "