diff --git a/.gitignore b/.gitignore index 47b1995..221421a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,145 +1,148 @@ -# Runtime data -./devruntime - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -!pkg/agent/tasks/lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# ignore PyCharm IDE preferences -.idea/ - -# Random test files / specs -*.tmp -cookies -*.sqlite -**/pgdata -client/ -generated/ -**/corpus_count.json +# Runtime data +./devruntime + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +!pkg/agent/tasks/lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# ignore PyCharm IDE preferences +.idea/ + +# Random test files / specs +*.tmp +cookies +*.sqlite +**/pgdata +client/ +generated/ +**/corpus_count.json +.vscode/settings.json +devdocker/devdocker/docker-compose.override.yml +devdocker/devdocker/docker-compose.yml diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..fbc75d0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "python.formatting.provider": "none" +} \ No newline at end of file diff --git a/devdocker/docker-compose.override.yml b/devdocker/docker-compose.override.yml new file mode 100644 index 0000000..73e132f --- /dev/null +++ b/devdocker/docker-compose.override.yml @@ -0,0 +1,59 @@ +version: "3.4" + + + +services: + frontend: + image: classtranscribe/frontend:staging + build: + context: ../../FrontEnd + dockerfile: ./Dockerfile + + api: + image: classtranscribe/api:staging + build: + context: ../../WebAPI + target: publish + dockerfile: ./API.Dockerfile + environment: + - LogEntityFrameworkSQL=true + + + taskengine: + image: classtranscribe/taskengine:staging + build: + context: ../../WebAPI + target: publish + dockerfile: ./TaskEngine.Dockerfile + + pythonrpcserver: + image: 
classtranscribe/pythonrpcserver:staging + build: + context: ../../WebAPI + dockerfile: ./pythonrpcserver.Dockerfile + + scenedetection: + image: classtranscribe/ct-python:staging + build: + context: ../ + dockerfile: ./Dockerfile + environment: + RABBITMQ_URI: 'amqp://mahipal2%40illinois.edu:Test123@rabbitmq:5672/%2f' + + phrasehinter: + image: classtranscribe/ct-python:staging + # No need to specify a build here; scene detection will build the image + environment: + RABBITMQ_URI: 'amqp://mahipal2%40illinois.edu:Test123@rabbitmq:5672/%2f' + + glossary: + image: classtranscribe/ct-python:staging + # No need to specify a build here; scene detection will build the image + environment: + RABBITMQ_URI: 'amqp://mahipal2%40illinois.edu:Test123@rabbitmq:5672/%2f' + + crawler: + image: classtranscribe/ct-python:staging + # No need to specify a build here; scene detection will build the image + environment: + RABBITMQ_URI: 'amqp://mahipal2%40illinois.edu:Test123@rabbitmq:5672/%2f' diff --git a/devdocker/docker-compose.yml b/devdocker/docker-compose.yml new file mode 100644 index 0000000..a448981 --- /dev/null +++ b/devdocker/docker-compose.yml @@ -0,0 +1,238 @@ +version: "3.4" + +services: + + # webserver to handle all traffic. This can use let's encrypt to generate a SSL cert. 
+ traefik: + image: traefik:v1.7 + command: + - --loglevel=INFO + - --api + # Entrypoints + - --defaultentrypoints=https,http + - --entryPoints=Name:http Address::${TRAEFIK_HTTP_PORT:-8000} ${TRAEFIK_HTTP_REDIRECT:-""} + - --entryPoints=Name:https Address::${TRAEFIK_HTTPS_PORT:-8443} ${TRAEFIK_HTTPS_OPTIONS:-TLS} + # Configuration for acme (https://letsencrypt.org/) + - --acme=${TRAEFIK_ACME_ENABLE:-false} + #- --acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory + - --acme.email=${ADMIN_USER_ID:-""} + - --acme.entrypoint=https + - --acme.onhostrule=true + - --acme.storage=/config/acme.json + - --acme.httpchallenge.entrypoint=http + - --acme.storage=/config/acme.json + - --acme.acmelogging=true + - --acme.domains=${HOST_NAME:-""} + - --acme.domains=${TRAEFIK_ADDITIONAL_HOST1:-""} + - --acme.domains=${TRAEFIK_ADDITIONAL_HOST2:-""} + # DOCKER + - --docker=true + - --docker.endpoint=unix:///var/run/docker.sock + - --docker.exposedbydefault=false + - --docker.watch=true + # - --docker.domain=docker.localhost + #restart: unless-stopped + ports: + - "${TRAEFIK_HTTP_PORT-8000}:${TRAEFIK_HTTP_PORT:-8000}" + - "${TRAEFIK_HTTPS_PORT-8443}:${TRAEFIK_HTTPS_PORT:-8443}" + labels: + - "traefik.enable=true" + - "traefik.backend=traefik" + - "traefik.port=8080" + - "traefik.frontend.rule=PathPrefixStrip: /traefik" + - "traefik.website.frontend.whiteList.sourceRange=${TRAEFIK_IPFILTER:-172.16.0.0/12}" + # - "traefik.http.middlewares.test-auth.basicauth.users=test:$$apr1$$H6uskkkW$$IgXLP6ewTrSuBkTrqE8wj/" + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + - ${DATA:-~/docker_data}/traefik:/config + container_name: "traefik" + + db: + image: postgres:11.7 + volumes: + - "${DATA:-~/docker_data}/pgvolume:/var/lib/postgresql/data" + ports: + - "127.0.0.1:5432:5432" + env_file: + - ".env" + environment: + - POSTGRES_USER=${ADMIN_USER_ID:-guest} + - POSTGRES_PASSWORD=${ADMIN_PASSWORD:-guest} + container_name: "db" + + pgadmin: + image: dpage/pgadmin4:4.11 + 
depends_on: + - db + - traefik + volumes: + - "${DATA:-~/docker_data}/pga4volume:/var/lib/pgadmin" + env_file: + - ".env" + environment: + - PGADMIN_DEFAULT_EMAIL=${ADMIN_USER_ID:-guest} + - PGADMIN_DEFAULT_PASSWORD=${ADMIN_PASSWORD:-guest} + container_name: "pgadmin" + labels: + - "traefik.enable=true" + - "traefik.backend=pgadmin" + - "traefik.port=80" + - "traefik.frontend.rule=PathPrefix:/pgadmin" + - "traefik.website.frontend.whiteList.sourceRange=${TRAEFIK_IPFILTER:-172.16.0.0/12}" + + rabbitmq: + image: rabbitmq:3.8.19-management + container_name: "rabbitmq" + env_file: + - ".env" + #restart: unless-stopped + environment: + # - RABBITMQ_CONFIG_FILE=/data/config/rabbitmq + - RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS=-rabbitmq_management path_prefix "/rabbitmq" + - RABBITMQ_DEFAULT_USER=${ADMIN_USER_ID:-guest} + - RABBITMQ_DEFAULT_PASS=${ADMIN_PASSWORD:-guest} + labels: + - "traefik.enable=true" + - "traefik.backend=rabbitmq" + - "traefik.port=15672" + - "traefik.frontend.rule=PathPrefix:/rabbitmq" + - "traefik.website.frontend.whiteList.sourceRange=${TRAEFIK_IPFILTER:-172.16.0.0/12}" + + pythonrpcserver: + image: classtranscribe/pythonrpcserver:staging + container_name: "pythonrpcserver" + env_file: + - ".env" + volumes: + - "${DATA:-~/docker_data}/data:/data" + ports: + - "127.0.0.1:50051:50051" + + api: + image: classtranscribe/api:staging + volumes: + - "${DATA:-~/docker_data}/data:/data" + container_name: "api" + depends_on: + - db + - rabbitmq + - traefik + env_file: + - ".env" + labels: + - "traefik.enable=true" + - "traefik.backend=api" + - "traefik.port=80" + - "traefik.frontend.rule=PathPrefix: /api/,/data/,/swag" + - "traefik.website.frontend.whiteList.sourceRange=${TRAEFIK_IPFILTER:-172.16.0.0/12}" + command: "/wait-for rabbitmq:5672 --timeout=50 -- dotnet /app/ClassTranscribeServer.dll" + + taskengine: + image: classtranscribe/taskengine:staging + volumes: + - "${DATA:-~/docker_data}/data:/data" + container_name: "taskengine" + depends_on: + - db + - 
rabbitmq + - pythonrpcserver + env_file: + - ".env" + command: "/wait-for rabbitmq:5672 --timeout=50 -- dotnet /app/TaskEngine.dll" + + + frontend: + image: classtranscribe/frontend:staging + env_file: + .env + depends_on: + - traefik + #- api + labels: + - "traefik.enable=true" + - "traefik.backend=frontend" + - "traefik.port=80" + - "traefik.frontend.rule=PathPrefix: /" + - "traefik.website.frontend.whiteList.sourceRange=${TRAEFIK_IPFILTER:-172.16.0.0/12}" + container_name: "frontend" + volumes: + - "${DATA:-~/docker_data}/live:/build/live" + + elasticsearch: + container_name: elasticsearch + image: docker.elastic.co/elasticsearch/elasticsearch:7.6.2 + volumes: + - "${DATA:-~/docker_data}/elasticsearch:/usr/share/elasticsearch/data" + environment: + - xpack.monitoring.enabled=true + - xpack.watcher.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - discovery.type=single-node + + scenedetection: + image: classtranscribe/ct-python:staging + volumes: + - "${DATA:-~/docker_data}/data:/data" + container_name: scenedetection + depends_on: + - rabbitmq + - api + - taskengine + env_file: + - ".env" + environment: + PYTHONUNBUFFERED: "1" + DATA_DIRECTORY: "/data" + RABBITMQ_QUEUENAME: 'SceneDetection' + SCENE_DETECT_ALGORITHM_CLASS: "SvmPoly2" + SCENE_DETECT_ALGORITHM_MODULE: "pkg.agent.tasks.lib.scenedetection.svm_poly2" + + phrasehinter: + image: classtranscribe/ct-python:staging + volumes: + - "${DATA:-~/docker_data}/data:/data" + container_name: phrasehinter + depends_on: + - rabbitmq + - api + - taskengine + env_file: + - ".env" + environment: + PYTHONUNBUFFERED: "1" + DATA_DIRECTORY: "/data" + RABBITMQ_QUEUENAME: 'PhraseHinter' + + glossary: + image: classtranscribe/ct-python:staging + volumes: + - "${DATA:-~/docker_data}/data:/data" + container_name: "glossary" + depends_on: + - api + - rabbitmq + - taskengine + - db + env_file: + - ".env" + environment: + PYTHONUNBUFFERED: "1" + DATA_DIRECTORY: "/data" + RABBITMQ_QUEUENAME: 'AccessibleGlossary' + + crawler: + 
image: classtranscribe/ct-python:staging + volumes: + - "${DATA:-~/docker_data}/data:/data" + container_name: "crawler" + depends_on: + - api + - rabbitmq + - taskengine + - db + env_file: + - ".env" + environment: + PYTHONUNBUFFERED: "1" + DATA_DIRECTORY: "/data" + RABBITMQ_QUEUENAME: 'PythonCrawler'
diff --git a/pkg/agent/tasks/SpeechToTextParser.py b/pkg/agent/tasks/SpeechToTextParser.py
new file mode 100644
index 0000000..d0f55f2
--- /dev/null
+++ b/pkg/agent/tasks/SpeechToTextParser.py
@@ -0,0 +1,62 @@
+"""Task that transcribes a video's audio with the project's whisper wrapper."""
+import json
+
+import requests
+
+from .AbstractTask import AbstractTask, TaskNames
+from pkg.agent.tasks.lib import whisper
+
+
+class SpeechToTextParser(AbstractTask):
+    """Run speech-to-text on a video and upload the transcription."""
+
+    @staticmethod
+    def get_name():
+        """Return this task's ``TaskNames`` entry."""
+        return TaskNames.SpeechToTextParser
+
+    def parse_video(self, video_id):
+        """Transcribe the audio of the video identified by ``video_id``.
+
+        Returns whatever ``whisper.transcribe`` returns, or ``None`` when the
+        video cannot be looked up.
+        """
+        video = self.get_video(video_id=video_id)
+        if video is None:
+            self.logger.error(' [%s] FAILED to lookup videoId=%s', video_id, video_id)
+            return None
+
+        # NOTE(review): assumes video["Audio"] is something Whisper accepts
+        # (e.g. a file path) -- confirm against the API's video schema.
+        return whisper.transcribe(video["Audio"])
+
+    def run_task(self, body, emitter):
+        """Handle one task message: transcribe the video and upload the result.
+
+        ``body['Data']`` carries the video id. ``emitter`` is not used by
+        this task.
+        """
+        self.logger.info(' [.] SpeechToTextParser message recv\'d: %s', body)
+        video_id = body['Data']
+        self.logger.info(' [%s] SpeechToTextParser started on videoId=%s...', video_id, video_id)
+
+        result = self.parse_video(video_id)
+        if result is None:
+            # parse_video already logged the lookup failure; nothing to upload.
+            return
+
+        # NOTE(review): assumes the openai-whisper result shape, a dict whose
+        # "text" entry holds the full transcript -- confirm the API wants this.
+        transcript = result["text"]
+
+        # TODO: switch to actual api; UpdatePhraseHints is only a placeholder.
+        response = requests.post(
+            url='%s/api/Task/UpdatePhraseHints?videoId=%s' % (self.target_host, video_id),
+            headers={'Content-Type': 'application/json', 'Authorization': 'Bearer %s' % self.jwt},
+            data=json.dumps({"phraseHints": transcript}))
+        if not response.ok:
+            self.logger.error(' [%s] FAILED to upload transcription: HTTP %s',
+                              video_id, response.status_code)
+            return
+
+        self.logger.info(' [%s] SpeechToTextParser complete!', video_id)
diff --git a/pkg/agent/tasks/lib/whisper.py b/pkg/agent/tasks/lib/whisper.py
new file mode 100644
index 0000000..cddb930
--- /dev/null
+++ b/pkg/agent/tasks/lib/whisper.py
@@ -0,0 +1,34 @@
+"""Wrapper around the Whisper speech-to-text library (openai-whisper API)."""
+from functools import lru_cache
+
+# Aliased because the wrapper class below is also called ``whisper``; without
+# the alias the class definition rebinds the name and hides the library.
+# NOTE(review): this relies on absolute imports resolving to the installed
+# package, i.e. this directory must not itself be on sys.path.
+import whisper as _whisper_lib
+
+
+class whisper:
+    """Hold a loaded Whisper model and transcribe audio with it."""
+
+    def __init__(self, model_name="base") -> None:
+        self.model = _whisper_lib.load_model(model_name)
+
+    def transcribe(self, audio):
+        """Return the model's transcription result for ``audio``."""
+        return self.model.transcribe(audio)
+
+
+@lru_cache(maxsize=1)
+def _default_transcriber():
+    """Build the shared transcriber on first use so the model loads once."""
+    return whisper()
+
+
+def transcribe(audio):
+    """Transcribe ``audio`` with a lazily created, shared "base" model.
+
+    Module-level entry point so callers can use ``whisper.transcribe(audio)``
+    after ``from pkg.agent.tasks.lib import whisper``.
+    """
+    return _default_transcriber().transcribe(audio)