Fix very large table ingestion, fixes alephdata#764
pudo committed Nov 3, 2019
1 parent 0541e82 commit b826b68
Showing 7 changed files with 14 additions and 8 deletions.
1 change: 0 additions & 1 deletion aleph/index/entities.py
@@ -96,7 +96,6 @@ def entities_by_ids(ids, schemata=None, cached=False,
         return
     index = entities_read_index(schema=schemata)
     query = {'ids': {'values': ids}}
-    # query = {'bool': {'filter': query}}
     query = {
         'query': query,
         '_source': _source_spec(includes, excludes),
2 changes: 1 addition & 1 deletion aleph/worker.py
@@ -52,7 +52,7 @@ def handle(self, task):
         if stage.stage == OP_BULKLOAD:
             bulk_load(stage, collection, payload)
         if stage.stage == OP_PROCESS:
-            if payload.get('reset'):
+            if payload.pop('reset', False):
                 reset_collection(collection, sync=True)
             process_collection(stage, collection, sync=sync, **payload)
         if stage.stage == OP_XREF:
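The switch from `payload.get('reset')` to `payload.pop('reset', False)` matters because the remaining payload is splatted into `process_collection(stage, collection, sync=sync, **payload)` on the next line: with `get()`, the already-consumed `reset` flag would still be forwarded as a keyword argument. A minimal sketch of the difference, using a hypothetical stand-in for `process_collection` (the real signature is not part of this diff):

def process_collection(stage, collection, sync=False, ingest=True):
    # Hypothetical stand-in; only here to show the keyword splat,
    # not the real aleph signature.
    print("process", collection, "sync:", sync, "ingest:", ingest)

payload = {'reset': True, 'ingest': True}

# payload.get('reset') would leave the key in place, so the subsequent
# process_collection(stage, collection, sync=sync, **payload) call would
# receive an unexpected reset=True keyword.
reset = payload.pop('reset', False)   # consumes the flag instead
assert 'reset' not in payload
process_collection('op_process', 'collection-1', sync=True, **payload)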
2 changes: 1 addition & 1 deletion requirements-toolkit.txt
@@ -2,7 +2,7 @@
 # Dependencies maintained by OCCRP
 banal==0.4.2
 urlnormalizer==1.2.5
-followthemoney==1.21.5
+followthemoney==1.22.0
 fingerprints==0.6.6
 servicelayer[google,amazon]==1.9.0
 normality==2.0.0
4 changes: 3 additions & 1 deletion services/ingest-file/ingestors/support/table.py
@@ -25,10 +25,12 @@ def emit_row_dicts(self, table, rows, headers=None):
                 csv_writer.writerow(values)
                 self.manager.emit_text_fragment(table, values, row_count)
                 row_count += 1
+                if row_count > 0 and row_count % 1000 == 0:
+                    log.info("Table emit [%s]: %s...", table, row_count)
         if row_count > 0:
             csv_hash = self.manager.store(csv_path, mime_type=CSV)
             table.set('csvHash', csv_hash)
-            table.set('rowCount', row_count + 1)
+            table.set('rowCount', row_count)
             table.set('columns', registry.json.pack(headers))
 
     def wrap_row_tuples(self, rows):
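Two things change in `emit_row_dicts`: a progress line is logged every 1,000 rows so very large table ingests remain visible in the worker logs, and the stored `rowCount` drops the `+ 1`, since `row_count` is incremented after each emitted row and therefore already equals the number of rows written. A minimal sketch of the counting pattern, with a hypothetical `emit_row` standing in for the CSV and text-fragment emission:

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

def emit_row(row):
    pass  # hypothetical stand-in for csv_writer.writerow / emit_text_fragment

def emit_rows(rows):
    row_count = 0
    for row in rows:
        emit_row(row)
        row_count += 1  # post-increment: equals rows emitted so far
        if row_count % 1000 == 0:
            log.info("Table emit: %s...", row_count)
    # row_count is already the number of rows; the old `row_count + 1`
    # over-counted by one.
    return row_count

assert emit_rows([{'a': 1}] * 2500) == 2500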
2 changes: 1 addition & 1 deletion services/ingest-file/requirements.txt
@@ -3,7 +3,7 @@ normality==2.0.0
 pantomime==0.3.3
 servicelayer[google,amazon]==1.9.0
 balkhash[leveldb,sql]==1.1.1
-followthemoney==1.21.5
+followthemoney==1.22.0
 languagecodes==1.0.5
 psycopg2-binary==2.8.4
 pyicu==2.3.1
2 changes: 1 addition & 1 deletion ui/package.json
@@ -3,7 +3,7 @@
   "version": "3.3.4",
   "private": true,
   "dependencies": {
-    "@alephdata/followthemoney": "^1.21.5",
+    "@alephdata/followthemoney": "^1.22.0",
     "@blueprintjs/core": "^3.18.1",
     "@blueprintjs/icons": "3.11.0",
     "@blueprintjs/select": "^3.8.0",
9 changes: 7 additions & 2 deletions ui/src/viewers/CsvStreamViewer.jsx
@@ -93,15 +93,20 @@ class CSVStreamViewer extends React.Component {
 
   render() {
     const { document } = this.props;
+    const { rows } = this.state;
     if (document.id === undefined) {
       return null;
     }
+    const numRows = parseInt(document.getFirst('rowCount'), 10);
     const columnsJson = document.getFirst('columns');
-    const columns = columnsJson ? JSON.parse(columnsJson) : [];
+    const columnsFtm = columnsJson ? JSON.parse(columnsJson) : [];
+    // HACK: Use the first row of the data as headers if nothing is in the
+    // FtM metadata.
+    const columns = columnsFtm.length || (rows.length > 0 ? rows[0] : []);
     return (
       <div className="TableViewer">
         <Table
-          numRows={document.getFirst('rowCount')}
+          numRows={numRows}
           enableGhostCells
           enableRowHeader
           onVisibleCellsChange={this.onVisibleCellsChange}
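On the UI side, `document.getFirst('rowCount')` returns the FtM property as a string, so it is now parsed with `parseInt(..., 10)` before being handed to the Table's `numRows` prop, which presumably expects a numeric value; and when the FtM metadata carries no column headers, the first row of the streamed data is used as a header fallback, flagged as a HACK in the code comment.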
