Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Exclude everything from the build context by default...
**
# ...then re-include only the paths listed below.
!common
!complaints
!config_sample.ini
!Makefile
!requirements.txt
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.11.11
python-version: 3.11

- name: Install Python dependencies
run: |
Expand Down
27 changes: 27 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
FROM python:3.13-alpine

ENV LANG=en_US.UTF-8
ENV PIP_NO_CACHE_DIR=true
ENV PYTHONUNBUFFERED=1
ENV HOME=/usr/home

WORKDIR ${HOME}

# Upgrade base packages and install the CLI tools the image needs.
# --no-cache makes apk fetch a fresh package index for each command without
# storing it, so a separate `apk update` step is unnecessary.
RUN apk upgrade --no-cache --ignore alpine-baselayout && \
    apk add --no-cache \
        aws-cli \
        jq \
        make

# Install Python dependencies before copying the rest of the build context so
# this layer is rebuilt only when requirements.txt changes, not on every
# source edit.
COPY requirements.txt ./
RUN pip install --upgrade pip setuptools && \
    pip install -r ./requirements.txt

COPY . .

# Don't run as the root user.
# The user is created and given ownership of ${HOME} in a single layer.
ARG USER=base
RUN adduser -g ${USER} --disabled-password ${USER} && \
    chown -R ${USER}:${USER} ${HOME}

USER ${USER}
45 changes: 29 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ OS_NAME := $(shell uname -s | tr A-Z a-z)
# "SOCRATA_JSON" = {metadata: {}, data: [[], [], []]}
# ND-JSON = {}\n{}\n{}\n

DIRS := complaints/ccdb/intake complaints/ccdb/ready_es complaints/ccdb/ready_s3
STATE_DIR := complaints/ccdb
DIRS := $(STATE_DIR)/intake $(STATE_DIR)/ready_es $(STATE_DIR)/ready_s3
MAX_RECORDS ?= 0

# Aliases
Expand All @@ -15,25 +16,25 @@ ALIAS := complaint-public-$(ENV)

CONFIG_CCDB := config-ccdb.ini

DATASET_CSV := complaints/ccdb/intake/complaints.csv
DATASET_ND_JSON := complaints/ccdb/ready_es/complaints.json
DATASET_PUBLIC_CSV := complaints/ccdb/ready_s3/complaints.csv
DATASET_PUBLIC_JSON := complaints/ccdb/ready_s3/complaints.json
DATASET_CSV := $(STATE_DIR)/intake/complaints.csv
DATASET_ND_JSON := $(STATE_DIR)/ready_es/complaints.json
DATASET_PUBLIC_CSV := $(STATE_DIR)/ready_s3/complaints.csv
DATASET_PUBLIC_JSON := $(STATE_DIR)/ready_s3/complaints.json

METADATA_JAVASCRIPT := complaints/ccdb/ready_s3/metadata.js
METADATA_JSON := complaints/ccdb/intake/complaints_metadata.json
METADATA_PUBLIC_JSON := complaints/ccdb/ready_s3/complaints_metadata.json
METADATA_JAVASCRIPT := $(STATE_DIR)/ready_s3/metadata.js
METADATA_JSON := $(STATE_DIR)/intake/complaints_metadata.json
METADATA_PUBLIC_JSON := $(STATE_DIR)/ready_s3/complaints_metadata.json

# Field Names

FIELDS_S3_CSV := complaints/ccdb/intake/fields-s3-csv.txt
FIELDS_S3_JSON := complaints/ccdb/intake/fields-s3-json.txt
FIELDS_S3_CSV := $(STATE_DIR)/intake/fields-s3-csv.txt
FIELDS_S3_JSON := $(STATE_DIR)/intake/fields-s3-json.txt

# Sentinels

INDEX_CCDB := complaints/ccdb/ready_es/.last_indexed
INPUT_S3_TIMESTAMP := complaints/ccdb/intake/.latest_dataset
PUSH_S3 := complaints/ccdb/ready_s3/.last_pushed
INDEX_CCDB := $(STATE_DIR)/ready_es/.last_indexed
INPUT_S3_TIMESTAMP := $(STATE_DIR)/intake/.latest_dataset
PUSH_S3 := $(STATE_DIR)/ready_s3/.last_pushed

# URLs

Expand All @@ -42,8 +43,8 @@ URL_PUBLIC_METADATA ?= https://files.consumerfinance.gov/ccdb/complaints_metadat

# Verification

S3_JSON_COUNT := complaints/ccdb/verification/json_prev_size.txt
AKAMAI_CACHE_COUNT := complaints/ccdb/verification/cache_prev_size.txt
S3_JSON_COUNT := $(STATE_DIR)/verification/json_prev_size.txt
AKAMAI_CACHE_COUNT := $(STATE_DIR)/verification/cache_prev_size.txt

# Defaults

Expand Down Expand Up @@ -82,7 +83,19 @@ clean:
# Create every working directory listed in $(DIRS).
# `mkdir -p` is a no-op for directories that already exist, and a single
# invocation fails the target if ANY directory cannot be created (a shell
# `for` loop would only report the status of its last iteration).
dirs:
	mkdir -p $(DIRS)

elasticsearch: dirs check_latest $(INDEX_CCDB)
# Re-index only when needed: when FORCE_REINDEX is non-empty, when the
# $(INDEX_CCDB) sentinel does not exist, or when the $(INPUT_S3_TIMESTAMP)
# sentinel is newer than $(INDEX_CCDB). Both sentinel timestamps are printed
# first for the log.
#
# get_timestamp tests for the file explicitly: in `ls | awk || echo none` the
# pipeline's status is awk's (always 0), so the fallback would never print.
elasticsearch: dirs check_latest
	@get_timestamp() { \
		if [ -e "$$1" ]; then \
			ls -l "$$1" | awk '{print $$6, $$7, $$8}'; \
		else \
			echo 'none'; \
		fi; \
	}; \
	echo "S3 dataset timestamp: $$(get_timestamp $(INPUT_S3_TIMESTAMP))"; \
	echo "Last index timestamp: $$(get_timestamp $(INDEX_CCDB))"
	@if [ -n "$(FORCE_REINDEX)" ] || \
	   [ ! -f $(INDEX_CCDB) ] || \
	   [ $(INPUT_S3_TIMESTAMP) -nt $(INDEX_CCDB) ]; then \
		$(MAKE) $(INDEX_CCDB); \
	else \
		echo "Data already indexed, nothing to do"; \
	fi


from_public: dirs
Expand Down
15 changes: 12 additions & 3 deletions common/es_proxy.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from urllib.parse import quote

from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

Expand Down Expand Up @@ -26,10 +28,17 @@ def add_basic_es_arguments(parser):


def get_es_connection(config):
    """Create an Elasticsearch client from the parsed configuration.

    Args:
        config: object exposing ``es_host``, ``es_port``, ``es_username``
            and ``es_password`` attributes.

    Returns:
        The object returned by ``Elasticsearch(...)``.

    When both a username and a password are set, they are percent-encoded
    and embedded in the URL as ``user:password@host``. SSL is enabled only
    when the configured port is 443.
    """
    host = config.es_host

    if config.es_username and config.es_password:
        # safe='' so that '/' is escaped as well; quote() leaves '/' alone by
        # default, which would end the userinfo/host part of the URL early.
        encoded_username = quote(config.es_username, safe='')
        encoded_password = quote(config.es_password, safe='')
        host = f'{encoded_username}:{encoded_password}@{host}'

    es = Elasticsearch(
        f'http://{host}:{config.es_port}',
        # The port may arrive as an int or a str, so compare as text.
        use_ssl=(str(config.es_port) == '443'),
        timeout=2000
    )
    return es

Expand Down
22 changes: 19 additions & 3 deletions common/tests/test_es_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,23 @@ def test_get_es_connection(self, mock_es):
actual = sut.get_es_connection(options)
self.assertEqual(actual, 'passed')
mock_es.assert_called_once_with(
'http://www.example.org:9222',
http_auth=('king', 'kong'),
'http://king:kong@www.example.org:9222',
timeout=2000,
user_ssl=True)
use_ssl=False)

@patch('common.es_proxy.Elasticsearch')
def test_get_es_connection_ssl(self, mock_es):
    """Port 443 enables SSL and the username is percent-encoded in the URL."""
    config = make_configargs({
        'es_host': 'www.example.org',
        'es_port': '443',
        'es_username': 'king!',
        'es_password': 'kong',
    })
    mock_es.return_value = 'passed'

    connection = sut.get_es_connection(config)

    # The client is whatever the (mocked) Elasticsearch constructor returned.
    self.assertEqual(connection, 'passed')
    expected_url = 'http://king%21:kong@www.example.org:443'
    mock_es.assert_called_once_with(expected_url, timeout=2000, use_ssl=True)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
boto3==1.11.7
boto3==1.37.38
ConfigArgParse==0.14.0
elasticsearch==7.13.4
ijson==2.4
Expand Down