Skip to content
This repository was archived by the owner on Apr 22, 2020. It is now read-only.

Change the default value of similarityCutoff parameter #730

Open
wants to merge 2 commits into
base: 3.4
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
public class CosineProc extends SimilarityProc {

@Procedure(name = "algo.similarity.cosine.stream", mode = Mode.READ)
@Description("CALL algo.similarity.cosine.stream([{item:id, weights:[weights]}], {similarityCutoff:-1,degreeCutoff:0}) " +
@Description("CALL algo.similarity.cosine.stream([{item:id, weights:[weights]}], {similarityCutoff:0.1,degreeCutoff:0}) " +
"YIELD item1, item2, count1, count2, intersection, similarity - computes cosine distance")
// todo count1,count2 = could be the non-null values, intersection the values where both are non-null?
public Stream<SimilarityResult> cosineStream(
Expand All @@ -57,7 +57,7 @@ public Stream<SimilarityResult> cosineStream(
}

@Procedure(name = "algo.similarity.cosine", mode = Mode.WRITE)
@Description("CALL algo.similarity.cosine([{item:id, weights:[weights]}], {similarityCutoff:-1,degreeCutoff:0}) " +
@Description("CALL algo.similarity.cosine([{item:id, weights:[weights]}], {similarityCutoff:0.1,degreeCutoff:0}) " +
"YIELD p50, p75, p90, p99, p999, p100 - computes cosine similarities")
public Stream<SimilaritySummaryResult> cosine(
@Name(value = "data", defaultValue = "null") List<Map<String, Object>> data,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public Stream<SimilarityResult> euclideanStream(

WeightedInput[] inputs = prepareWeights(data, getDegreeCutoff(configuration));

double similarityCutoff = getSimilarityCutoff(configuration);
double similarityCutoff = configuration.get("similarityCutoff", -1.0D);
// as we don't compute the sqrt until the end
if (similarityCutoff > 0d) similarityCutoff *= similarityCutoff;

Expand All @@ -69,7 +69,7 @@ public Stream<SimilaritySummaryResult> euclidean(

WeightedInput[] inputs = prepareWeights(data, getDegreeCutoff(configuration));

double similarityCutoff = getSimilarityCutoff(configuration);
double similarityCutoff = configuration.get("similarityCutoff", -1.0D);
// as we don't compute the sqrt until the end
if (similarityCutoff > 0d) similarityCutoff *= similarityCutoff;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
public class JaccardProc extends SimilarityProc {

@Procedure(name = "algo.similarity.jaccard.stream", mode = Mode.READ)
@Description("CALL algo.similarity.jaccard.stream([{item:id, categories:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " +
@Description("CALL algo.similarity.jaccard.stream([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " +
"YIELD item1, item2, count1, count2, intersection, similarity - computes jaccard similarities")
public Stream<SimilarityResult> similarityStream(
@Name(value = "data", defaultValue = "null") List<Map<String,Object>> data,
Expand All @@ -45,7 +45,7 @@ public Stream<SimilarityResult> similarityStream(
}

@Procedure(name = "algo.similarity.jaccard", mode = Mode.WRITE)
@Description("CALL algo.similarity.jaccard([{item:id, categories:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " +
@Description("CALL algo.similarity.jaccard([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " +
"YIELD p50, p75, p90, p99, p999, p100 - computes jaccard similarities")
public Stream<SimilaritySummaryResult> jaccard(
@Name(value = "data", defaultValue = "null") List<Map<String, Object>> data,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
public class OverlapProc extends SimilarityProc {

@Procedure(name = "algo.similarity.overlap.stream", mode = Mode.READ)
@Description("CALL algo.similarity.overlap.stream([{item:id, targets:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " +
@Description("CALL algo.similarity.overlap.stream([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " +
"YIELD item1, item2, count1, count2, intersection, similarity - computes overlap similarities")
public Stream<SimilarityResult> similarityStream(
@Name(value = "data", defaultValue = "null") List<Map<String,Object>> data,
Expand All @@ -47,7 +47,7 @@ public Stream<SimilarityResult> similarityStream(
}

@Procedure(name = "algo.similarity.overlap", mode = Mode.WRITE)
@Description("CALL algo.similarity.overlap([{item:id, targets:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " +
@Description("CALL algo.similarity.overlap([{item:id, categories:[ids]}], {similarityCutoff:0.1,degreeCutoff:0}) " +
"YIELD p50, p75, p90, p99, p999, p100 - computes overlap similarities")
public Stream<SimilaritySummaryResult> overlap(
@Name(value = "data", defaultValue = "null") List<Map<String, Object>> data,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Stream<SimilaritySummaryResult> writeAndAggregateResults(ProcedureConfiguration
}

Double getSimilarityCutoff(ProcedureConfiguration configuration) {
return configuration.get("similarityCutoff", -1D);
return configuration.get("similarityCutoff", 0.1D);
}

<T> Stream<SimilarityResult> similarityStream(T[] inputs, SimilarityComputer<T> computer, ProcedureConfiguration configuration, double cutoff, int topK) {
Expand Down
4 changes: 2 additions & 2 deletions doc/asciidoc/similarity-cosine.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ For example, the person most similar to Praveena is Michael, but the person most
| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, weights: [weight, weight, weight]}`
| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds.
| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds.
| `similarityCutoff` | int | -1 | yes | The threshold for cosine similarity. Values below this will not be returned.
| `similarityCutoff` | float | 0.1 | yes | The threshold for cosine similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `weights` list. If the list contains fewer items than this, that node will be excluded from the calculation.
| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
|===
Expand Down Expand Up @@ -220,7 +220,7 @@ include::scripts/similarity-cosine.cypher[tag=query]
| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, weights: [weight, weight, weight]}`
| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds.
| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds.
| `similarityCutoff` | int | -1 | yes | The threshold for Jaccard similarity. Values below this will not be returned.
| `similarityCutoff` | float | 0.1 | yes | The threshold for cosine similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `weights` list. If the list contains fewer items than this, that node will be excluded from the calculation.
| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
| `write` | boolean | false | yes | Indicates whether results should be stored.
Expand Down
4 changes: 2 additions & 2 deletions doc/asciidoc/similarity-jaccard.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ For example, the person most similar to Praveena is Zhen, but the person most si
| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}`
| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds.
| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds.
| `similarityCutoff` | int | -1 | yes | The threshold for Jaccard similarity. Values below this will not be returned.
| `similarityCutoff` | float | 0.1 | yes | The threshold for Jaccard similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `categories` list. If the list contains fewer items than this, that node will be excluded from the calculation.
| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
|===
Expand Down Expand Up @@ -217,7 +217,7 @@ include::scripts/similarity-jaccard.cypher[tag=query]
| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}`
| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds.
| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds.
| `similarityCutoff` | int | -1 | yes | The threshold for Jaccard similarity. Values below this will not be returned.
| `similarityCutoff` | float | 0.1 | yes | The threshold for Jaccard similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `categories` list. If the list contains fewer items than this, that node will be excluded from the calculation.
| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
| `write` | boolean | false | yes | Indicates whether results should be stored.
Expand Down
4 changes: 2 additions & 2 deletions doc/asciidoc/similarity-overlap.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ include::scripts/similarity-overlap.cypher[tag=stream-topk]
| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}`
| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds.
| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds.
| `similarityCutoff` | int | -1 | yes | The threshold for Overlap similarity. Values below this will not be returned.
| `similarityCutoff` | float | 0.1 | yes | The threshold for Overlap similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `categories` list. If the list contains fewer items than this, that node will be excluded from the calculation.
| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
|===
Expand Down Expand Up @@ -208,7 +208,7 @@ include::scripts/similarity-overlap.cypher[tag=query]
| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}`
| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds.
| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds.
| `similarityCutoff` | int | -1 | yes | The threshold for Overlap similarity. Values below this will not be returned.
| `similarityCutoff` | float | 0.1 | yes | The threshold for Overlap similarity. Values below this will not be returned.
| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `categories` list. If the list contains fewer items than this, that node will be excluded from the calculation.
| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
| `write` | boolean | false | yes | Indicates whether results should be stored.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,15 @@ public void cosineSingleMultiThreadComparisionTopK() {

@Test
public void topNcosineStreamTest() {
Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2)));
Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2,"similarityCutoff",-1.0)));
assert01(results.next());
assert02(results.next());
assertFalse(results.hasNext());
}

@Test
public void cosineStreamTest() {
Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1)));
Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1,"similarityCutoff",-1.0)));
assertTrue(results.hasNext());
assert01(results.next());
assert02(results.next());
Expand All @@ -208,7 +208,7 @@ public void cosineStreamTest() {

@Test
public void topKCosineStreamTest() {
Map<String, Object> params = map("config", map( "concurrency", 1,"topK", 1));
Map<String, Object> params = map("config", map( "concurrency", 1,"topK", 1, "similarityCutoff", -1.0));
System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());
Result results = db.execute(STATEMENT_STREAM, params);
assertTrue(results.hasNext());
Expand Down Expand Up @@ -253,7 +253,7 @@ public void topK4cosineStreamTest() {

@Test
public void topK3cosineStreamTest() {
Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3));
Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3, "similarityCutoff", -1.0));

System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());

Expand All @@ -267,7 +267,7 @@ public void topK3cosineStreamTest() {

@Test
public void simpleCosineTest() {
Map<String, Object> params = map("config", map());
Map<String, Object> params = map("config", map("similarityCutoff",-1.0));

Map<String, Object> row = db.execute(STATEMENT,params).next();
assertEquals((double) row.get("p25"), 0.0, 0.01);
Expand All @@ -283,7 +283,7 @@ public void simpleCosineTest() {
public void simpleCosineFromEmbeddingTest() {
db.execute(STORE_EMBEDDING_STATEMENT);

Map<String, Object> params = map("config", map());
Map<String, Object> params = map("config", map("similarityCutoff",-1.0));

Map<String, Object> row = db.execute(EMBEDDING_STATEMENT,params).next();
assertEquals((double) row.get("p25"), 0.0, 0.01);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,15 @@ public void jaccardSingleMultiThreadComparisionTopK() {

@Test
public void topNjaccardStreamTest() {
Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2)));
Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2,"similarityCutoff",-1.0)));
assert01(results.next());
assert02(results.next());
assertFalse(results.hasNext());
}

@Test
public void jaccardStreamTest() {
Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1)));
Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1,"similarityCutoff",-1.0)));
assertTrue(results.hasNext());
assert01(results.next());
assert02(results.next());
Expand All @@ -182,7 +182,7 @@ public void jaccardStreamTest() {

@Test
public void topKJaccardStreamTest() {
Map<String, Object> params = map("config", map( "concurrency", 1,"topK", 1));
Map<String, Object> params = map("config", map( "concurrency", 1,"topK", 1,"similarityCutoff",-1.0));
System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());

Result results = db.execute(STATEMENT_STREAM, params);
Expand Down Expand Up @@ -227,7 +227,7 @@ public void topK4jaccardStreamTest() {

@Test
public void topK3jaccardStreamTest() {
Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3));
Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3, "similarityCutoff",-1.0));

System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ public void topNoverlapStreamTest() {

@Test
public void overlapStreamTest() {
Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1)));
Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1,"similarityCutoff",-1.0)));

assertTrue(results.hasNext());
assert01(results.next());
Expand Down Expand Up @@ -229,7 +229,7 @@ public void topK4overlapStreamTest() {

@Test
public void topK3overlapStreamTest() {
Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3));
Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3, "similarityCutoff", -1.0));

System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());

Expand Down