diff --git a/src/main/java/edu/anadolu/datasets/DataSet.java b/src/main/java/edu/anadolu/datasets/DataSet.java index 1473189..349382d 100644 --- a/src/main/java/edu/anadolu/datasets/DataSet.java +++ b/src/main/java/edu/anadolu/datasets/DataSet.java @@ -3,6 +3,8 @@ import org.clueweb09.InfoNeed; import org.clueweb09.tracks.Track; +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -48,7 +50,7 @@ public Path home() { } public Path indexesPath() { - return Paths.get(tfd_home, collection.toString(), "indexes"); + /* Index root defaults to tfd_home; pass -Dtfd.indexes.home=<dir> to relocate it instead of hard-coding a machine-specific path. */ return Paths.get(System.getProperty("tfd.indexes.home", tfd_home), collection.toString(), "indexes"); } diff --git a/src/main/java/edu/anadolu/ltr/DocFeatureBase.java b/src/main/java/edu/anadolu/ltr/DocFeatureBase.java index be57bb7..8cff9c3 100644 --- a/src/main/java/edu/anadolu/ltr/DocFeatureBase.java +++ b/src/main/java/edu/anadolu/ltr/DocFeatureBase.java @@ -53,6 +53,11 @@ public class DocFeatureBase { List keyword; List description; List hTags; + float[] vectorlistContent; + float[] vectortitle; + float[] vectorkeyword; + float[] vectordescription; + float[] vectorhTags; IndexSearcher searcher; IndexReader reader; Map mapTf; @@ -85,6 +90,13 @@ public class DocFeatureBase { .map(e -> e.text()) .map(String::trim) .filter(notEmpty).collect(Collectors.toList()); + if(bert!=null){ + vectorlistContent = bertVector(this.listContent); + vectortitle = bertVector(this.title); + vectorkeyword = bertVector(this.keyword); + vectordescription = bertVector(this.description); + vectorhTags = bertVector(this.hTags); + } } catch (Exception exception) { System.err.println("jdoc exception " + warcRecord.id()); exception.printStackTrace(); @@ -273,14 +285,16 @@ protected double cosSim(String str1, String str2){ return score; } - protected double bertSim(String str1, String str2){ - if(str1.length()==0 || str2.length()==0) return 0; - float[][] embeddings = bert.embedSequences(str1,str2); + protected float[] 
bertVector(List<String> str){ + /* Nothing to embed: return null, which bertSim scores as 0. */ if(str==null || str.isEmpty()) return null; + return bert.embedSequence(String.join(" ",str)); + } + + protected double bertSim(float[] vectorA, float[] vectorB){ + if(vectorA==null || vectorB==null) return 0; double dotProduct = 0.0; double normA = 0.0; double normB = 0.0; - float[] vectorA = embeddings[0]; - float[] vectorB = embeddings[1]; for (int i = 0; i < vectorA.length; i++) { dotProduct += vectorA[i] * vectorB[i]; normA += Math.pow(vectorA[i], 2); diff --git a/src/main/java/edu/anadolu/ltr/SEOTool.java b/src/main/java/edu/anadolu/ltr/SEOTool.java index ff5cc64..a35e7b4 100644 --- a/src/main/java/edu/anadolu/ltr/SEOTool.java +++ b/src/main/java/edu/anadolu/ltr/SEOTool.java @@ -1,5 +1,6 @@ package edu.anadolu.ltr; +import com.robrua.nlp.bert.Bert; import edu.anadolu.Indexer; import edu.anadolu.analysis.Analyzers; import edu.anadolu.analysis.Tag; @@ -149,31 +150,35 @@ public void run(Properties props) throws Exception { ///////////////////////////// Index Reading for stats /////////////////////////////////////////// Path indexPath=null; - if(this.tag == null) - indexPath = Files.newDirectoryStream(dataset.indexesPath(), Files::isDirectory).iterator().next(); - else { - try (DirectoryStream<Path> stream = Files.newDirectoryStream(dataset.indexesPath(), Files::isDirectory)) { - for (Path path : stream) { - if(!tag.equals(path.getFileName().toString())) continue; - indexPath = path; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - if(indexPath == null) - throw new RuntimeException(tag + " index not found"); - - - this.indexTag = indexPath.getFileName().toString(); - this.analyzerTag = Tag.tag(indexTag); - - this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); - - IndexSearcher searcher = new IndexSearcher(reader); - CollectionStatistics collectionStatistics = searcher.collectionStatistics(Indexer.FIELD_CONTENTS); - +// if(this.tag == null) +// indexPath = Files.newDirectoryStream(dataset.indexesPath(), 
Files::isDirectory).iterator().next(); +// else { +// try (DirectoryStream<Path> stream = Files.newDirectoryStream(dataset.indexesPath(), Files::isDirectory)) { +// for (Path path : stream) { +// if(!tag.equals(path.getFileName().toString())) continue; +// indexPath = path; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// +// if(indexPath == null) +// throw new RuntimeException(tag + " index not found"); +// +// +// this.indexTag = indexPath.getFileName().toString(); +// this.analyzerTag = Tag.tag(indexTag); + this.analyzerTag = Tag.tag(tag); + +// this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); + + IndexSearcher searcher = null; +// IndexSearcher searcher = new IndexSearcher(reader); + CollectionStatistics collectionStatistics = null; +// CollectionStatistics collectionStatistics = searcher.collectionStatistics(Indexer.FIELD_CONTENTS); + + Bert bert = null; List features = new ArrayList<>(); @@ -272,6 +277,7 @@ public void run(Properties props) throws Exception { features.add(new SimContentTitle(type)); features.add(new SimTitleH(type)); features.add(new SimTitleKeyword(type)); + /* Load the model only once: if this block runs again, a second Bert.load would overwrite the first handle without closing it. */ if(bert==null) bert = Bert.load("com/robrua/nlp/easy-bert/bert-uncased-L-12-H-768-A-12"); } } } @@ -297,8 +303,8 @@ public void run(Properties props) throws Exception { features.add(new CDD()); } - Traverser traverser = new Traverser(dataset, docsPath, docIdSet, features, collectionStatistics, analyzerTag, searcher, reader, resultsettype); - System.out.println("Average Doc Len = "+(double)collectionStatistics.sumTotalTermFreq()/collectionStatistics.docCount()); + Traverser traverser = new Traverser(dataset, docsPath, docIdSet, features, collectionStatistics, analyzerTag, searcher, reader, resultsettype, bert); +// System.out.println("Average Doc Len = "+(double)collectionStatistics.sumTotalTermFreq()/collectionStatistics.docCount()); final int numThreads = props.containsKey("numThreads") ? 
Integer.parseInt(props.getProperty("numThreads")) : Runtime.getRuntime().availableProcessors(); System.out.println(numThreads + " threads are running."); diff --git a/src/main/java/edu/anadolu/ltr/SimContentDescription.java b/src/main/java/edu/anadolu/ltr/SimContentDescription.java index c3a1a67..2462f9c 100644 --- a/src/main/java/edu/anadolu/ltr/SimContentDescription.java +++ b/src/main/java/edu/anadolu/ltr/SimContentDescription.java @@ -25,7 +25,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.listContent, base.description); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.listContent),String.join(" ",base.description)); + return base.bertSim(base.vectorlistContent,base.vectordescription); return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.description)); } } \ No newline at end of file diff --git a/src/main/java/edu/anadolu/ltr/SimContentH.java b/src/main/java/edu/anadolu/ltr/SimContentH.java index 85098ee..28426bc 100644 --- a/src/main/java/edu/anadolu/ltr/SimContentH.java +++ b/src/main/java/edu/anadolu/ltr/SimContentH.java @@ -27,7 +27,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.listContent, base.hTags); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.listContent),String.join(" ",base.hTags)); + return base.bertSim(base.vectorlistContent,base.vectorhTags); return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.hTags)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimContentKeyword.java b/src/main/java/edu/anadolu/ltr/SimContentKeyword.java index c2f806b..a932bda 100644 --- a/src/main/java/edu/anadolu/ltr/SimContentKeyword.java +++ b/src/main/java/edu/anadolu/ltr/SimContentKeyword.java @@ -24,7 +24,7 @@ public String toString() { public double 
calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.listContent, base.keyword); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.listContent),String.join(" ",base.keyword)); + return base.bertSim(base.vectorlistContent,base.vectorkeyword); return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.keyword)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimContentTitle.java b/src/main/java/edu/anadolu/ltr/SimContentTitle.java index 122e558..4fb8b88 100644 --- a/src/main/java/edu/anadolu/ltr/SimContentTitle.java +++ b/src/main/java/edu/anadolu/ltr/SimContentTitle.java @@ -24,7 +24,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.listContent, base.title); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.listContent),String.join(" ",base.title)); + return base.bertSim(base.vectorlistContent,base.vectortitle); return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.title)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimDescriptionH.java b/src/main/java/edu/anadolu/ltr/SimDescriptionH.java index 6d723a5..9e01227 100644 --- a/src/main/java/edu/anadolu/ltr/SimDescriptionH.java +++ b/src/main/java/edu/anadolu/ltr/SimDescriptionH.java @@ -27,7 +27,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.description, base.hTags); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.description),String.join(" ",base.hTags)); + return base.bertSim(base.vectordescription,base.vectorhTags); return base.cosSim(String.join(" ",base.description),String.join(" ",base.hTags)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimKeywordDescription.java b/src/main/java/edu/anadolu/ltr/SimKeywordDescription.java index 
89c7591..3e323a0 100644 --- a/src/main/java/edu/anadolu/ltr/SimKeywordDescription.java +++ b/src/main/java/edu/anadolu/ltr/SimKeywordDescription.java @@ -24,7 +24,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.keyword, base.description); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.keyword),String.join(" ",base.description)); + return base.bertSim(base.vectorkeyword,base.vectordescription); return base.cosSim(String.join(" ",base.keyword),String.join(" ",base.description)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimKeywordH.java b/src/main/java/edu/anadolu/ltr/SimKeywordH.java index 21097e5..13fc249 100644 --- a/src/main/java/edu/anadolu/ltr/SimKeywordH.java +++ b/src/main/java/edu/anadolu/ltr/SimKeywordH.java @@ -26,7 +26,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.keyword, base.hTags); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.keyword),String.join(" ",base.hTags)); + return base.bertSim(base.vectorkeyword,base.vectorhTags); return base.cosSim(String.join(" ",base.keyword),String.join(" ",base.hTags)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimTitleDescription.java b/src/main/java/edu/anadolu/ltr/SimTitleDescription.java index e874dc2..831d25d 100644 --- a/src/main/java/edu/anadolu/ltr/SimTitleDescription.java +++ b/src/main/java/edu/anadolu/ltr/SimTitleDescription.java @@ -24,7 +24,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.title, base.description); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.title),String.join(" ",base.description)); + return base.bertSim(base.vectortitle,base.vectordescription); return base.cosSim(String.join(" 
",base.title),String.join(" ",base.description)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimTitleH.java b/src/main/java/edu/anadolu/ltr/SimTitleH.java index dbe4f09..b9030ac 100644 --- a/src/main/java/edu/anadolu/ltr/SimTitleH.java +++ b/src/main/java/edu/anadolu/ltr/SimTitleH.java @@ -26,7 +26,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.title, base.hTags); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.title),String.join(" ",base.hTags)); + return base.bertSim(base.vectortitle, base.vectorhTags); return base.cosSim(String.join(" ",base.title),String.join(" ",base.hTags)); } } diff --git a/src/main/java/edu/anadolu/ltr/SimTitleKeyword.java b/src/main/java/edu/anadolu/ltr/SimTitleKeyword.java index 3f457c6..879b649 100644 --- a/src/main/java/edu/anadolu/ltr/SimTitleKeyword.java +++ b/src/main/java/edu/anadolu/ltr/SimTitleKeyword.java @@ -24,7 +24,7 @@ public String toString() { public double calculate(DocFeatureBase base) throws IOException, NullPointerException { // return base.textSimilarity(base.title, base.keyword); if("bert".equals(this.type)) - return base.bertSim(String.join(" ",base.title),String.join(" ",base.keyword)); + return base.bertSim(base.vectortitle,base.vectorkeyword); return base.cosSim(String.join(" ",base.title),String.join(" ",base.keyword)); } } diff --git a/src/main/java/edu/anadolu/ltr/Traverser.java b/src/main/java/edu/anadolu/ltr/Traverser.java index 71532a4..462ecad 100644 --- a/src/main/java/edu/anadolu/ltr/Traverser.java +++ b/src/main/java/edu/anadolu/ltr/Traverser.java @@ -195,8 +195,9 @@ protected boolean skip(String docId) { private IndexSearcher searcher; private IndexReader reader; private String resultsettype; + private Bert bert; - Traverser(DataSet dataset, String docsDir, Set docIdSet, List featureList, CollectionStatistics collectionStatistics, Tag analyzerTag, IndexSearcher 
searcher, IndexReader reader, String resultsettype) { + Traverser(DataSet dataset, String docsDir, Set docIdSet, List featureList, CollectionStatistics collectionStatistics, Tag analyzerTag, IndexSearcher searcher, IndexReader reader, String resultsettype, Bert bert) { this.collection = dataset.collection(); this.docIdSet = docIdSet; this.featureList = featureList; @@ -205,6 +206,7 @@ protected boolean skip(String docId) { this.searcher = searcher; this.reader = reader; this.resultsettype=resultsettype; + this.bert=bert; docsPath = Paths.get(docsDir); if (!Files.exists(docsPath) || !Files.isReadable(docsPath) || !Files.isDirectory(docsPath)) { @@ -218,7 +220,7 @@ protected boolean skip(String docId) { * Traverse based on Java8's parallel streams */ void traverseParallel(Path resultPath, int numThreads) throws IOException { - Bert bert = Bert.load("com/robrua/nlp/easy-bert/bert-uncased-L-12-H-768-A-12"); + // RelatednessCalculator rc1 = new WuPalmer(new NictWordNet());