Skip to content

Commit df817ca

Browse files
author
Pedro Teixeira
committed
Merged PR 2450: Added embeddings service code.
Added embeddings service code. We will potentially create a common lib to hold the code shared with SVM, but only if we decide to use both backends. Related work items: #4441
1 parent 387748d commit df817ca

File tree

14 files changed

+751
-84
lines changed

14 files changed

+751
-84
lines changed

.gitignore

Lines changed: 52 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,101 +1,69 @@
1-
# Byte-compiled / optimized / DLL files
2-
__pycache__/
3-
*.py[cod]
4-
*$py.class
5-
6-
# C extensions
7-
*.so
8-
9-
# Distribution / packaging
10-
.Python
11-
env/
12-
build/
13-
develop-eggs/
14-
dist/
15-
downloads/
16-
eggs/
17-
.eggs/
18-
lib/
19-
lib64/
20-
parts/
21-
sdist/
22-
var/
23-
wheels/
24-
*.egg-info/
25-
.installed.cfg
26-
*.egg
27-
28-
# PyInstaller
29-
# Usually these files are written by a python script from a template
30-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
31-
*.manifest
32-
*.spec
33-
34-
# Installer logs
35-
pip-log.txt
36-
pip-delete-this-directory.txt
1+
# VS code
2+
.vscode/
373

38-
# Unit test / coverage reports
39-
htmlcov/
40-
.tox/
41-
.coverage
42-
.coverage.*
43-
.cache
44-
nosetests.xml
45-
coverage.xml
46-
*.cover
47-
.hypothesis/
4+
bin/
485

49-
# Translations
50-
*.mo
51-
*.pot
6+
# ---- PYTHON ----
7+
# Compiled Python
8+
*.pyc
9+
.cache/
10+
.pytest_cache/
5211

53-
# Django stuff:
54-
*.log
55-
local_settings.py
12+
# Package
13+
*.egg-info/
14+
venv/
5615

57-
# Flask stuff:
58-
instance/
59-
.webassets-cache
16+
**/TEST*.xml
6017

61-
# Scrapy stuff:
62-
.scrapy
18+
### Intellij ###
19+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
20+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
6321

64-
# Sphinx documentation
65-
docs/_build/
22+
# User-specific stuff:
23+
**/.idea/workspace.xml
24+
**/.idea/tasks.xml
25+
**/.idea/dictionaries
26+
**/.idea/vcs.xml
27+
**/.idea/jsLibraryMappings.xml
6628

67-
# PyBuilder
68-
target/
29+
# Sensitive or high-churn files:
30+
**/.idea/dataSources.ids
31+
**/.idea/dataSources.xml
32+
**/.idea/dataSources.local.xml
33+
**/.idea/sqlDataSources.xml
34+
**/.idea/dynamic.xml
35+
**/.idea/uiDesigner.xml
36+
**/.idea/inspectionProfiles/
6937

70-
# Jupyter Notebook
71-
.ipynb_checkpoints
38+
# Gradle:
39+
**/.idea/gradle.xml
40+
**/.idea/libraries
7241

73-
# pyenv
74-
.python-version
42+
# Mongo Explorer plugin:
43+
**/.idea/mongoSettings.xml
7544

76-
# celery beat schedule file
77-
celerybeat-schedule
45+
## File-based project format:
46+
**/*.iws
7847

79-
# SageMath parsed files
80-
*.sage.py
48+
## Plugin-specific files:
8149

82-
# dotenv
83-
.env
50+
# IntelliJ
51+
/out/
8452

85-
# virtualenv
86-
.venv
87-
venv/
88-
ENV/
53+
# mpeltonen/sbt-idea plugin
54+
**/.idea_modules/
8955

90-
# Spyder project settings
91-
.spyderproject
92-
.spyproject
56+
# JIRA plugin
57+
atlassian-ide-plugin.xml
9358

94-
# Rope project settings
95-
.ropeproject
59+
# Crashlytics plugin (for Android Studio and IntelliJ)
60+
com_crashlytics_export_strings.xml
61+
crashlytics.properties
62+
crashlytics-build.properties
63+
fabric.properties
9664

97-
# mkdocs documentation
98-
/site
65+
### Intellij Patch ###
66+
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
9967

100-
# mypy
101-
.mypy_cache/
68+
# *.iml
69+
# modules.xml

src/.flake8

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[flake8]
2+
max-line-length=99
3+
max-complexity=10

src/.style.yapf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[style]
2+
based_on_style = pep8
3+
column_limit = 79

src/Dockerfile

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Base off of the Python image
FROM python:3.5.2-slim
LABEL maintainer="Pedro Teixeira <pedrotei@hutoma.com>"

# Install the build toolchain (presumably needed to compile native extensions
# pulled in by requirements.txt - confirm). apt-get is used for both steps
# because the `apt` front end does not offer a stable CLI for scripts.
# The package lists are removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip
# Install the Python dependencies before the code is copied, so this layer
# can be reused from cache when only the code changes
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Run script to download models for languages we require
RUN python -m spacy download en

# Copy the code
COPY embedding/ /src/embedding/

#---------------------------
# switch to non root user
# define user/group IDs as ARG
ARG USERID=1000
ARG GROUPID=1000
RUN addgroup --system --gid $GROUPID appuser
RUN adduser --system --uid $USERID --gid $GROUPID appuser

USER appuser
WORKDIR /home/appuser

# Get the NLTK corpus data (run as appuser, after the USER switch above)
RUN python -m nltk.downloader stopwords

# NOTE(review): EMB_SERVER_PORT is presumably read by server.py - confirm;
# keep it in sync with the EXPOSE below
ENV EMB_SERVER_PORT=9090

EXPOSE 9090

CMD ["python3", "/src/embedding/server.py"]

src/embedding/__init__.py

Whitespace-only changes.

src/embedding/chat_process.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""Embedding chat worker processes"""
2+
3+
import logging
4+
import numpy
5+
6+
import ai_training.chat_process as ait_c
7+
from spacy_wrapper import SpacyWrapper
8+
from text_classifier_class import EmbeddingComparison
9+
from word2vec_client import Word2VecClient
10+
from svc_config import SvcConfig
11+
import aiohttp
12+
13+
# File name of the model file; it is appended to the worker's ai_path when
# the model is loaded for a chat request
MODEL_FILE = "model.pkl"
14+
15+
16+
def _get_logger():
    """Return the logger registered under the name 'embedding.chat'."""
    return logging.getLogger('embedding.chat')
19+
20+
21+
class EmbeddingChatProcessWorker(ait_c.ChatProcessWorkerABC):
    """Chat worker that answers questions with an EmbeddingComparison model.

    For each chat request the worker tokenizes the question with spaCy,
    fetches word vectors from the word2vec service, loads the model file
    from the worker's ``ai_path`` and returns the top prediction.
    """

    # Shared by all instances of this class; created lazily in
    # setup_chat_session() so SpacyWrapper is only constructed once.
    __spacy_wrapper = None

    def __init__(self, pool, asyncio_loop):
        """Initialise worker state and the word2vec client.

        Args:
            pool: passed through unchanged to the base class.
            asyncio_loop: passed through unchanged to the base class.
        """
        super().__init__(pool, asyncio_loop)
        self.chatter = None
        self.chat_args = None
        self.ai = None
        self.is_ready = False
        self.logger = _get_logger()
        self.w2v_client = Word2VecClient(
            SvcConfig.get_instance().w2v_server_url)

    async def __aenter__(self):
        """Enter the async context; returns the worker itself."""
        return self

    async def __aexit__(self, exc_type, exc, traceb):
        """Leave the async context, forwarding the exit to the chatter."""
        if self.chatter is not None:
            self.chatter.__exit__(exc_type, exc, traceb)

    async def start_chat(self, msg: ait_c.WakeChatMessage):
        """Handle a wake request"""

    async def chat_request(self, msg: ait_c.ChatRequestMessage):
        """Handle a chat request.

        Sets up the chat session when ``msg.update_state`` is set or no
        chatter exists, then predicts an answer for ``msg.question``.

        Args:
            msg: the chat request; ``question`` and ``update_state`` are
                read from it.

        Returns:
            A ChatResponseMessage built from ``msg``, the first prediction
            and the first value of the second array returned by
            ``predict`` (presumably its probability - confirm). When the
            worker is not ready, or the word2vec service cannot be
            reached, the response carries ``None`` and ``0.0`` instead.
        """
        # NOTE(review): nothing in this class assigns self.chatter, so this
        # condition appears to hold on every request; the setup is
        # idempotent, so that is harmless.
        if msg.update_state or self.chatter is None:
            self.setup_chat_session()

        if not self.is_ready:
            return ait_c.ChatResponseMessage(msg, None, 0.0)

        words = msg.question.split(' ')
        tokenize = EmbeddingChatProcessWorker.__spacy_wrapper.tokenizeSpacy
        # predict() is given a list of tokenized questions; here just one
        x_tokens_testset = [tokenize(msg.question)]

        try:
            vecs = await self.w2v_client.get_vectors_for_words(words)
        except aiohttp.client_exceptions.ClientConnectorError as exc:
            self.logger.warning(
                "Could not receive response from w2v service - %s", exc)
            return ait_c.ChatResponseMessage(msg, None, 0.0)

        # NOTE(review): the model file is re-loaded on every request;
        # self.ai_path is not set in this class (presumably provided by
        # the base class - confirm).
        classifier = EmbeddingComparison(w2v=vecs)
        classifier.load_model(self.ai_path + "/" + MODEL_FILE)
        y_pred, y_probs = classifier.predict(x_tokens_testset)
        return ait_c.ChatResponseMessage(msg, y_pred[0], y_probs[0])

    def setup_chat_session(self):
        """Create the shared SpacyWrapper if needed and mark the worker ready."""
        if EmbeddingChatProcessWorker.__spacy_wrapper is None:
            EmbeddingChatProcessWorker.__spacy_wrapper = SpacyWrapper()

        self.is_ready = True

0 commit comments

Comments
 (0)