Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More similarity metrics #1396

Draft
wants to merge 5 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions core/src/main/java/de/jplag/JPlagResult.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.jplag;

import java.util.Arrays;
import java.util.List;
import java.util.function.ToDoubleFunction;

Expand All @@ -11,6 +12,7 @@
* Encapsulates the results of a comparison of a set of source code submissions.
*/
public class JPlagResult {
private final static int SIMILARITY_DISTRIBUTION_SIZE = 100;

private List<JPlagComparison> comparisons; // comparisons whose similarity was above the specified threshold

Expand All @@ -23,7 +25,6 @@ public class JPlagResult {
private final int[] similarityDistribution; // 10-element array representing the similarity distribution of the detected matches.

private List<ClusteringResult<Submission>> clusteringResult;
private final int SIMILARITY_DISTRIBUTION_SIZE = 100;

public JPlagResult(List<JPlagComparison> comparisons, SubmissionSet submissions, long durationInMillis, JPlagOptions options) {
// sort by similarity (descending)
Expand All @@ -34,15 +35,6 @@ public JPlagResult(List<JPlagComparison> comparisons, SubmissionSet submissions,
similarityDistribution = calculateSimilarityDistribution(comparisons);
}

/**
* Drops elements from the comparison list to free memory. Note, that this affects the similarity distribution and is
* only meant to be used if you don't need the information about comparisons with lower match similarity anymore.
* @param limit the number of comparisons to keep in the list
*/
public void dropComparisons(int limit) {
this.comparisons = this.getComparisons(limit);
}

/**
 * Stores the given clustering result in this object, replacing whatever was stored before.
 * @param clustering the clustering result to store
 */
public void setClusteringResult(List<ClusteringResult<Submission>> clustering) {
this.clusteringResult = clustering;
}
Expand Down Expand Up @@ -127,6 +119,10 @@ public String toString() {
getDuration(), getOptions().language().getName(), submissions.numberOfSubmissions());
}

/**
 * Calculates the distribution of the comparisons stored in this result for the given metric.
 * Delegates to the array-based overload and boxes the resulting counts.
 * @param similarityMetric function that maps a comparison to the value the distribution is built from
 * @return the distribution as an unmodifiable list
 */
public List<Integer> calculateDistributionFor(ToDoubleFunction<JPlagComparison> similarityMetric) {
    int[] distribution = calculateDistributionFor(this.comparisons, similarityMetric);
    return Arrays.stream(distribution).boxed().toList();
}

/**
* Note: Before, comparisons with a similarity below the given threshold were also included in the similarity matrix.
*/
Expand Down
23 changes: 22 additions & 1 deletion core/src/main/java/de/jplag/options/SimilarityMetric.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,27 @@
package de.jplag.options;

import java.util.HashMap;
import java.util.Map;
import java.util.function.ToDoubleFunction;

import de.jplag.JPlagComparison;
import de.jplag.Match;

public enum SimilarityMetric implements ToDoubleFunction<JPlagComparison> {
AVG("average similarity", JPlagComparison::similarity),
MIN("minimum similarity", JPlagComparison::minimalSimilarity),
MAX("maximal similarity", JPlagComparison::maximalSimilarity),
INTERSECTION("matched tokens", it -> (double) it.getNumberOfMatchedTokens());
INTERSECTION("matched tokens", it -> (double) it.getNumberOfMatchedTokens()),
SYMMETRIC("symmetric similarity", it -> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this equivalent to the implementation of the "average similarity", only ignoring base code?

int divisor = it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens();
if (divisor != 0) {
return 2.0 * it.getNumberOfMatchedTokens() / divisor;
} else {
return .0;
}
}),
LONGEST_MATCH("number of tokens in the longest match", it -> it.matches().stream().mapToInt(Match::length).max().orElse(0)),
OVERALL("Sum of both submission lengths", it -> it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens());

private final ToDoubleFunction<JPlagComparison> similarityFunction;
private final String description;
Expand All @@ -31,4 +44,12 @@ public double applyAsDouble(JPlagComparison comparison) {
public String toString() {
return description;
}

/**
 * Evaluates every similarity metric of this enum for a single comparison.
 * @param comparison the comparison each metric is applied to
 * @return a new mutable map from the enum constant name of each metric to its value for the comparison
 */
public static Map<String, Double> createSimilarityMap(JPlagComparison comparison) {
    Map<String, Double> valuesByMetricName = new HashMap<>();
    for (SimilarityMetric current : values()) {
        double value = current.applyAsDouble(comparison);
        valuesByMetricName.put(current.name(), value);
    }
    return valuesByMetricName;
}
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package de.jplag.reporting.jsonfactory;

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
Expand Down Expand Up @@ -56,8 +53,7 @@ private void writeComparisons(String path, List<JPlagComparison> comparisons) {
String secondSubmissionId = submissionToIdFunction.apply(comparison.secondSubmission());
String fileName = generateComparisonName(firstSubmissionId, secondSubmissionId);
addToLookUp(firstSubmissionId, secondSubmissionId, fileName);
var comparisonReport = new ComparisonReport(firstSubmissionId, secondSubmissionId,
Map.of(SimilarityMetric.AVG.name(), comparison.similarity(), SimilarityMetric.MAX.name(), comparison.maximalSimilarity()),
var comparisonReport = new ComparisonReport(firstSubmissionId, secondSubmissionId, SimilarityMetric.createSimilarityMap(comparison),
convertMatchesToReportMatches(comparison));
fileWriter.saveAsJSON(comparisonReport, path, fileName);
});
Expand Down Expand Up @@ -98,7 +94,7 @@ private Match convertMatchToReportMatch(JPlagComparison comparison, de.jplag.Mat
List<Token> tokensFirst = comparison.firstSubmission().getTokenList().subList(match.startOfFirst(), match.endOfFirst() + 1);
List<Token> tokensSecond = comparison.secondSubmission().getTokenList().subList(match.startOfSecond(), match.endOfSecond() + 1);

Comparator<? super Token> lineComparator = (first, second) -> first.getLine() - second.getLine();
Comparator<? super Token> lineComparator = Comparator.comparingInt(Token::getLine);

Token startOfFirst = tokensFirst.stream().min(lineComparator).orElseThrow();
Token endOfFirst = tokensFirst.stream().max(lineComparator).orElseThrow();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
package de.jplag.reporting.reportobject.mapper;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

import de.jplag.JPlagComparison;
import de.jplag.JPlagResult;
import de.jplag.Submission;
import de.jplag.options.SimilarityMetric;
Expand All @@ -29,8 +26,11 @@ public MetricMapper(Function<Submission, String> submissionToIdFunction) {
* @return Map with key as name of metric and value as distribution
*/
public static Map<String, List<Integer>> getDistributions(JPlagResult result) {
return Map.of(SimilarityMetric.AVG.name(), convertDistribution(result.getSimilarityDistribution()), SimilarityMetric.MAX.name(),
convertDistribution(result.getMaxSimilarityDistribution()));
Map<String, List<Integer>> distributions = new HashMap<>();
for (SimilarityMetric metric : SimilarityMetric.values()) {
distributions.put(metric.name(), result.calculateDistributionFor(metric));
}
return distributions;
}

/**
Expand All @@ -41,17 +41,7 @@ public static Map<String, List<Integer>> getDistributions(JPlagResult result) {
public List<TopComparison> getTopComparisons(JPlagResult result) {
return result.getComparisons(result.getOptions().maximumNumberOfComparisons()).stream()
.map(comparison -> new TopComparison(submissionToIdFunction.apply(comparison.firstSubmission()),
submissionToIdFunction.apply(comparison.secondSubmission()), getComparisonMetricMap(comparison)))
submissionToIdFunction.apply(comparison.secondSubmission()), SimilarityMetric.createSimilarityMap(comparison)))
.toList();
}

private Map<String, Double> getComparisonMetricMap(JPlagComparison comparison) {
return Map.of(SimilarityMetric.AVG.name(), comparison.similarity(), SimilarityMetric.MAX.name(), comparison.maximalSimilarity());
}

private static List<Integer> convertDistribution(int[] array) {
List<Integer> list = new ArrayList<>(Arrays.stream(array).boxed().toList());
Collections.reverse(list);
return list;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import static org.mockito.Mockito.mock;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

Expand All @@ -16,6 +17,7 @@
import de.jplag.JPlagResult;
import de.jplag.Submission;
import de.jplag.options.JPlagOptions;
import de.jplag.options.SimilarityMetric;
import de.jplag.reporting.reportobject.model.TopComparison;

public class MetricMapperTest {
Expand All @@ -39,7 +41,8 @@ public void test_getDistributions() {
Map<String, List<Integer>> result = MetricMapper.getDistributions(jPlagResult);

// then
Assertions.assertEquals(Map.of("AVG", EXPECTED_AVG_DISTRIBUTION, "MAX", EXPECTED_MAX_DISTRIBUTION), result);
Assertions.assertEquals(EXPECTED_AVG_DISTRIBUTION, result.get("AVG"));
Assertions.assertEquals(EXPECTED_MAX_DISTRIBUTION, result.get("MAX"));
}

@Test
Expand All @@ -53,13 +56,11 @@ public void test_getTopComparisons() {

// then
Assertions.assertEquals(
List.of(new TopComparison("1", "2", Map.of("AVG", .7, "MAX", .8)), new TopComparison("3", "4", Map.of("AVG", .3, "MAX", .9))),
result);
List.of(new TopComparison("1", "2", buildSimilarityMap(.7, .8)), new TopComparison("3", "4", buildSimilarityMap(.3, .9))), result);
}

private int[] distribution(List<Integer> expectedDistribution) {
var reversedDistribution = new ArrayList<>(expectedDistribution);
Collections.reverse(reversedDistribution);
return reversedDistribution.stream().mapToInt(Integer::intValue).toArray();
}

Expand All @@ -76,6 +77,9 @@ private JPlagResult createJPlagResult(int[] avgDistribution, int[] maxDistributi
doReturn(avgDistribution).when(jPlagResult).getSimilarityDistribution();
doReturn(maxDistribution).when(jPlagResult).getMaxSimilarityDistribution();

doReturn(Arrays.stream(avgDistribution).boxed().toList()).when(jPlagResult).calculateDistributionFor(SimilarityMetric.AVG);
doReturn(Arrays.stream(maxDistribution).boxed().toList()).when(jPlagResult).calculateDistributionFor(SimilarityMetric.MAX);

JPlagOptions options = mock(JPlagOptions.class);
doReturn(createComparisonsDto.length).when(options).maximumNumberOfComparisons();
doReturn(options).when(jPlagResult).getOptions();
Expand Down Expand Up @@ -105,4 +109,14 @@ private record Comparison(CreateSubmission submission1, CreateSubmission submiss
private record CreateSubmission(String name) {
}

/**
 * Builds the expected similarity map for a comparison: one entry per {@link SimilarityMetric}, keyed by the enum
 * constant name. AVG and MAX carry the given values; every other metric is set to zero.
 * @param avg expected value for the AVG entry
 * @param max expected value for the MAX entry
 * @return a new mutable map containing an entry for every metric
 */
private Map<String, Double> buildSimilarityMap(double avg, double max) {
    Map<String, Double> expected = new HashMap<>();
    for (SimilarityMetric metric : SimilarityMetric.values()) {
        double value = switch (metric) {
            case AVG -> avg;
            case MAX -> max;
            default -> 0d;
        };
        expected.put(metric.name(), value);
    }
    return expected;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public double getSimilarityForMetric(SimilarityMetric metric) {
case MIN -> resultSimilarityMinimum();
case MAX -> resultSimilarityMaximum();
case INTERSECTION -> resultMatchedTokenNumber();
default -> throw new IllegalArgumentException(String.format("Similarity metric %s not supported for end to end tests", metric.name()));
};
}

Expand Down
Loading