Skip to content

Commit 82d5991

Browse files
Primary keys validation
1 parent 87bd4dd commit 82d5991

File tree

13 files changed

+295
-4
lines changed

13 files changed

+295
-4
lines changed

pom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
<junit.version>5.13.2</junit.version>
4141
<slf4j-simple.version>2.0.17</slf4j-simple.version>
4242
<apache-commons-collections4.version>4.5.0</apache-commons-collections4.version>
43+
<externalsortinginjava.version>0.6.2</externalsortinginjava.version>
4344
<maven-compiler-plugin.version>3.14.0</maven-compiler-plugin.version>
4445
<maven-dependency-plugin.version>3.8.1</maven-dependency-plugin.version>
4546
<maven-source-plugin.version>3.3.1</maven-source-plugin.version>
@@ -206,6 +207,13 @@
206207
<version>${tableschema-java-version}</version>
207208
</dependency>
208209

210+
<!-- Sorting -->
211+
<dependency>
212+
<groupId>com.google.code.externalsortinginjava</groupId>
213+
<artifactId>externalsortinginjava</artifactId>
214+
<version>${externalsortinginjava.version}</version>
215+
</dependency>
216+
209217
<!-- Unit Testing -->
210218
<dependency>
211219
<groupId>org.junit.jupiter</groupId>

src/main/java/io/frictionlessdata/datapackage/resource/AbstractResource.java

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.fasterxml.jackson.annotation.JsonProperty;
77
import com.fasterxml.jackson.core.JsonProcessingException;
88
import com.fasterxml.jackson.databind.ObjectMapper;
9+
import com.google.code.externalsorting.ExternalSort;
910
import io.frictionlessdata.datapackage.Dialect;
1011
import io.frictionlessdata.datapackage.JSONBase;
1112
import io.frictionlessdata.datapackage.Package;
@@ -14,10 +15,7 @@
1415
import io.frictionlessdata.datapackage.exceptions.DataPackageValidationException;
1516
import io.frictionlessdata.datapackage.fk.PackageForeignKey;
1617
import io.frictionlessdata.tableschema.Table;
17-
import io.frictionlessdata.tableschema.exception.ForeignKeyException;
18-
import io.frictionlessdata.tableschema.exception.JsonSerializingException;
19-
import io.frictionlessdata.tableschema.exception.TableIOException;
20-
import io.frictionlessdata.tableschema.exception.TypeInferringException;
18+
import io.frictionlessdata.tableschema.exception.*;
2119
import io.frictionlessdata.tableschema.field.Field;
2220
import io.frictionlessdata.tableschema.fk.ForeignKey;
2321
import io.frictionlessdata.tableschema.io.FileReference;
@@ -40,6 +38,7 @@
4038
import java.nio.file.Files;
4139
import java.nio.file.Path;
4240
import java.util.*;
41+
import java.util.stream.Collectors;
4342

4443
/**
4544
* Abstract base implementation of a Resource.
@@ -374,6 +373,7 @@ public List<Table> getTables() throws Exception {
374373
return tables;
375374
}
376375

376+
@Override
377377
public void checkRelations(Package pkg) {
378378
if (null != schema) {
379379
List<PackageForeignKey> fks = new ArrayList<>();
@@ -445,6 +445,71 @@ public void checkRelations(Package pkg) {
445445
}
446446
}
447447

448+
@Override
449+
public void checkPrimaryKeys() {
450+
if (null != schema) {
451+
Object pkObj = schema.getPrimaryKey();
452+
if (pkObj == null) {
453+
return; // no primary key defined
454+
}
455+
456+
// Normalize PK fields
457+
String[] pkFields;
458+
if (pkObj instanceof String) {
459+
pkFields = new String[]{(String) pkObj};
460+
} else if (pkObj instanceof String[]) {
461+
pkFields = (String[]) pkObj;
462+
} else {
463+
throw new PrimaryKeyException("Unsupported primary key type: " + pkObj.getClass());
464+
}
465+
466+
try {
467+
// Dump all keys to a temporary file
468+
Path tempFile = Files.createTempFile("pk-check", ".txt");
469+
try (BufferedWriter writer = Files.newBufferedWriter(tempFile)) {
470+
List<Object> data = this.getData(true, false, true, false);
471+
for (Object d : data) {
472+
Map<String, Object> row = (Map<String, Object>) d;
473+
String key = Arrays.stream(pkFields)
474+
.map(f -> String.valueOf(row.get(f)))
475+
.collect(Collectors.joining("\t"));
476+
writer.write(key);
477+
writer.newLine();
478+
}
479+
}
480+
481+
// Use ExternalSort to sort the file
482+
File inputFile = tempFile.toFile();
483+
File sortedFile = Files.createTempFile("pk-check-sorted", ".txt").toFile();
484+
485+
List<File> tempChunks = ExternalSort.sortInBatch(inputFile);
486+
ExternalSort.mergeSortedFiles(tempChunks, sortedFile);
487+
488+
// Scan sorted file line-by-line for duplicates
489+
try (BufferedReader reader = new BufferedReader(new FileReader(sortedFile))) {
490+
String prev = null;
491+
String line;
492+
while ((line = reader.readLine()) != null) {
493+
if (line.equals(prev)) {
494+
throw new PrimaryKeyException(
495+
"Primary key violation in resource '" + this.getName() +
496+
"': duplicate key " + line
497+
);
498+
}
499+
prev = line;
500+
}
501+
}
502+
503+
// Cleanup
504+
Files.deleteIfExists(tempFile);
505+
Files.deleteIfExists(sortedFile.toPath());
506+
507+
} catch (Exception e) {
508+
throw new PrimaryKeyException("Error validating primary keys: " + e.getMessage());
509+
}
510+
}
511+
}
512+
448513
public void validate(Package pkg) {
449514

450515
try {

src/main/java/io/frictionlessdata/datapackage/resource/Resource.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,8 @@ static ResourceBuilder builder(String resourceName) {
376376

377377
void checkRelations(Package pkg) throws Exception;
378378

379+
/**
 * Validate the primary key of this Resource.
 *
 * NOTE(review): the implementation in AbstractResource returns without doing anything when
 * no schema or no primary key is present, and reports rows sharing the same primary key
 * value(s) with a PrimaryKeyException; other implementations may behave differently.
 *
 * @throws Exception if the primary key validation fails
 */
void checkPrimaryKeys() throws Exception;
380+
379381
/**
380382
* Recreate a Resource object from a JSON descriptor, a base path to resolve relative file paths against
381383
* and a flag that tells us whether we are reading from inside a ZIP archive.
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.frictionlessdata.datapackage;
15+
16+
import io.frictionlessdata.datapackage.resource.Resource;
17+
import io.frictionlessdata.tableschema.exception.PrimaryKeyException;
18+
import org.junit.jupiter.api.DisplayName;
19+
import org.junit.jupiter.api.Test;
20+
21+
import java.nio.file.Path;
22+
23+
import static org.junit.jupiter.api.Assertions.*;
24+
import static org.junit.jupiter.api.Assertions.assertEquals;
25+
26+
public class PrimaryKeysTest {
27+
28+
@Test
29+
@DisplayName("Test the uniqueness of simple primary keys - invalid case")
30+
void testPrimaryKeysUniqueInvalid() throws Exception {
31+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/simple/primary_keys_csv_invalid.json");
32+
Package pkg = new Package(resourcePath, true);
33+
Resource teams = pkg.getResource("teams");
34+
35+
Throwable ex = assertThrows(Exception.class, teams::checkPrimaryKeys);
36+
assertInstanceOf(PrimaryKeyException.class, ex);
37+
assertEquals("Error validating primary keys: Primary key violation in resource 'teams': duplicate key 1", ex.getMessage());
38+
}
39+
40+
@Test
41+
@DisplayName("Test the uniqueness of simple primary keys - valid case")
42+
void testPrimaryKeysUniqueValid() throws Exception {
43+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/simple/primary_keys_csv_valid.json");
44+
Package pkg = new Package(resourcePath, true);
45+
Resource teams = pkg.getResource("teams");
46+
47+
assertDoesNotThrow(teams::checkPrimaryKeys);
48+
}
49+
50+
@Test
51+
@DisplayName("Test the uniqueness of composite primary keys - invalid case")
52+
void testCompositePrimaryKeysUniqueInvalid() throws Exception {
53+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/composite/primary_keys_csv_invalid.json");
54+
Package pkg = new Package(resourcePath, true);
55+
Resource teams = pkg.getResource("teams");
56+
57+
Throwable ex = assertThrows(Exception.class, teams::checkPrimaryKeys);
58+
assertInstanceOf(PrimaryKeyException.class, ex);
59+
assertEquals("Error validating primary keys: Primary key violation in resource 'teams': duplicate key UK\tLondon", ex.getMessage());
60+
}
61+
62+
@Test
63+
@DisplayName("Test the uniqueness of composite primary keys - valid case")
64+
void testCompositePrimaryKeysUniqueValid() throws Exception {
65+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/composite/primary_keys_csv_valid.json");
66+
Package pkg = new Package(resourcePath, true);
67+
Resource teams = pkg.getResource("teams");
68+
69+
assertDoesNotThrow(teams::checkPrimaryKeys);
70+
}
71+
}

src/test/java/io/frictionlessdata/datapackage/resource/NonTabularResourceTest.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,10 @@ public String getSerializationFormat() {
753753
public void checkRelations(Package aPackage) throws Exception {
754754
}
755755

756+
// Deliberately empty: this stub performs no primary key validation and never throws.
@Override
757+
public void checkPrimaryKeys() throws Exception {
758+
}
759+
756760
@Override
757761
public void validate(Package aPackage) {
758762
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"name": "foreign-keys",
3+
"resources": [
4+
{
5+
"name": "teams",
6+
"profile": "tabular-data-resource",
7+
"encoding": "UTF-8",
8+
"format": "csv",
9+
"schema": {
10+
"fields": [
11+
{
12+
"name": "name",
13+
"type": "string"
14+
},
15+
{
16+
"name": "country",
17+
"type": "string"
18+
},
19+
{
20+
"name": "city",
21+
"type": "string"
22+
}
23+
],
24+
"primaryKey": ["country", "city"]
25+
},
26+
"path": "teams.csv"
27+
}
28+
]
29+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"name": "foreign-keys",
3+
"resources": [
4+
{
5+
"name": "teams",
6+
"profile": "tabular-data-resource",
7+
"encoding": "UTF-8",
8+
"format": "csv",
9+
"schema": {
10+
"fields": [
11+
{
12+
"name": "name",
13+
"type": "string"
14+
},
15+
{
16+
"name": "country",
17+
"type": "string"
18+
},
19+
{
20+
"name": "city",
21+
"type": "string"
22+
}
23+
],
24+
"primaryKey": ["country", "city"]
25+
},
26+
"path": "teams-valid.csv"
27+
}
28+
]
29+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
name,country,city
2+
Arsenal,UK,London
3+
Real,Spain,Madrid
4+
Bayern,Germany,Munich
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
name,country,city
2+
Arsenal,UK,London
3+
Real,Spain,Madrid
4+
Bayern,Germany,Munich
5+
Chelsea,UK,London
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"name": "foreign-keys",
3+
"resources": [
4+
{
5+
"name": "teams",
6+
"profile": "tabular-data-resource",
7+
"encoding": "UTF-8",
8+
"format": "csv",
9+
"schema": {
10+
"fields": [
11+
{
12+
"name": "id",
13+
"type": "integer",
14+
"constraints": {
15+
"required": true,
16+
"unique": true
17+
}
18+
},
19+
{
20+
"name": "name",
21+
"type": "string"
22+
},
23+
{
24+
"name": "city",
25+
"type": "string"
26+
}
27+
],
28+
"primaryKey": "id"
29+
},
30+
"path": "teams.csv"
31+
}
32+
]
33+
}

0 commit comments

Comments
 (0)