Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmetaydin123 committed Apr 26, 2021
2 parents 5274731 + 6159524 commit 9b3d7cb
Show file tree
Hide file tree
Showing 19 changed files with 82 additions and 106 deletions.
2 changes: 1 addition & 1 deletion src/main/java/edu/anadolu/analysis/Analyzers.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ private static Analyzer anlyzr(Tag tag) throws IOException {
.addTokenFilter("englishpossessive")
.addTokenFilter("snowballporter", "language", "English")
.build();

case ICU:
return CustomAnalyzer.builder()
.withTokenizer("icu")
Expand Down
10 changes: 7 additions & 3 deletions src/main/java/edu/anadolu/ltr/CDD.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,23 @@ public double calculate(DocFeatureBase base) throws IOException {
// Formula is borrowed from the paper: Zhou, Y., & Croft, W. B. (2005, October). Document quality models for web ad hoc retrieval.
double cdd = 0.0;
double lambda = 0.8;



int contentsize = base.listContent.size();
long totalTerm = base.collectionStatistics.sumTotalTermFreq();
boolean nan = false;
for(Map.Entry<String,Integer> word : base.mapTf.entrySet()){
Term term = new Term(Indexer.FIELD_CONTENTS, word.getKey());
if(TermContext.build(base.reader.getContext(), term).totalTermFreq()==0) continue;
double pColl = (double) TermContext.build(base.reader.getContext(), term).totalTermFreq()/totalTerm;
double pDoc = (double) word.getValue()/ contentsize;
double pwd = (lambda * pDoc) + ((1-lambda) * pColl);
cdd += pColl * Math.log(pColl/pwd);
if(pwd==0 || Double.isNaN(pwd)) continue;
double result = pColl * Math.log(pColl/pwd);
cdd += result;
}



// if(cdd>10 || cdd<-10){
// System.out.println("****************************************************************************************************************************************");
// System.out.println("Doc Id = " + base.docId + " CDD : " + cdd);
Expand Down
7 changes: 3 additions & 4 deletions src/main/java/edu/anadolu/ltr/DocFeatureBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,14 @@ public class DocFeatureBase {

/**
 * Builds one tab-separated output line for this document: the document id
 * first, then one {@code name:value} entry per feature in {@code featureList}.
 *
 * @param featureList features to evaluate against this document, in output order
 * @return the assembled line; each value is rendered with five decimal places
 * @throws IOException declared because {@code IDocFeature.calculate} is invoked here
 */
String calculate(List<IDocFeature> featureList) throws IOException {

// NOTE(review): with this guard commented out the method never returns null,
// although the Traverser hunk on this page still checks the result for null - confirm intent.
// if(StringUtils.isEmpty(url)) return null;

StringBuilder builder = new StringBuilder();
builder.append(docId);
// long start2=0;
// builder.append("\t").append(url);
for (IDocFeature iDoc : featureList) {
// start2 = System.nanoTime();
double value = iDoc.calculate(this);
// Entry format is "<feature name>:<value>", where the name comes from the feature's toString().
// NOTE(review): String.format without an explicit Locale uses the default locale,
// so the decimal separator may vary between machines - confirm this is acceptable.
builder.append("\t").append(iDoc.toString()).append(":").append(String.format("%.5f", value));
// if((System.nanoTime()-start2)>1000000000)
// System.out.println(iDoc.getClass().getSimpleName() + " "+ (System.nanoTime()-start2)/1000000000);
}
return builder.toString();
}
Expand Down
79 changes: 28 additions & 51 deletions src/main/java/edu/anadolu/ltr/SEOTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,11 @@ public void run(Properties props) throws Exception {
}
}

System.out.println(docIdSet.size() + " docs will be processed.");
// docIdSet.removeAll(retrieveDocIdSetFromExisting(Paths.get("Seo12B.txt")));


System.out.println(docIdSet.size() + " docs will be processed.");

DataSet dataset = CollectionFactory.dataset(collection, tfd_home);
long start = System.nanoTime();

Expand Down Expand Up @@ -175,7 +178,6 @@ public void run(Properties props) throws Exception {

List<IDocFeature> features = new ArrayList<>();
if(type.equals("seo")){
if(seopart==null) {
features.add(new Contact());
features.add(new ContentLengthOver1800());
features.add(new Copyright());
Expand Down Expand Up @@ -214,48 +216,6 @@ public void run(Properties props) throws Exception {

features.add(new SimTitleH());
features.add(new SimTitleKeyword());
}else{
if(seopart.equals("1")){
features.add(new Contact());
features.add(new ContentLengthOver1800());
features.add(new Copyright());
features.add(new Description());
features.add(new Favicon());
features.add(new Https());
features.add(new Keyword());
features.add(new KeywordInDomain());
features.add(new KeywordInFirst100Words());
features.add(new KeywordInImgAltTag());
features.add(new KeywordInTitle());
features.add(new Robots());
features.add(new SocialMediaShare());
features.add(new Viewport());
features.add(new AlttagToImg());
features.add(new ContentLengthToMax());
features.add(new HdensityToMax());
features.add(new ImgToMax());
features.add(new IndexOfKeywordInTitle());
features.add(new InOutlinkToAll());
features.add(new UrlLength());
features.add(new MetaTagToMax());
features.add(new NoFollowToAll());
}else if(seopart.equals("2")){
features.add(new SimDescriptionH());
features.add(new SimContentDescription());
}else if(seopart.equals("3")){
features.add(new SimKeywordDescription());
features.add(new SimContentH());
}else if(seopart.equals("4")){
features.add(new SimKeywordH());
features.add(new SimContentKeyword());
}else if(seopart.equals("5")){
features.add(new SimTitleDescription());
features.add(new SimContentTitle());
}else if(seopart.equals("6")){
features.add(new SimTitleH());
features.add(new SimTitleKeyword());
}
}
}else if(type.equals("doc")){
features.add(new NumberOfChildPages(collection));
features.add(new InLinkCount(collection));
Expand Down Expand Up @@ -336,6 +296,29 @@ private Set<String> retrieveDocIdSetFromResultset(Path file) throws IOException

return docIdSet;
}
/**
 * Reads a previously written output file and collects the document id found
 * at the start of every data line.
 *
 * <p>Lines are trimmed first. Blank lines and lines starting with {@code #}
 * are skipped, so an empty or indented line can neither register an empty
 * doc id nor cause an out-of-bounds access on the split result.
 *
 * @param file path of the existing output file
 * @return the distinct doc ids, i.e. the first token of each data line as
 *         produced by {@code Track.whiteSpaceSplitter} (defined outside this
 *         view; presumably a whitespace pattern - confirm)
 * @throws IOException if the file cannot be read
 */
private Set<String> retrieveDocIdSetFromExisting(Path file) throws IOException {

    Set<String> docIdSet = new HashSet<>();

    for (String rawLine : Files.readAllLines(file)) {

        // Trim so leading whitespace does not yield an empty first token
        // or hide a leading '#'.
        String line = rawLine.trim();

        if (line.isEmpty() || line.startsWith("#")) continue;

        docIdSet.add(Track.whiteSpaceSplitter.split(line)[0]);
    }

    System.out.println("Existing docs "+docIdSet.size());

    return docIdSet;
}


private Set<String> retrieveDocIdSetForLetor(Path file) throws IOException {
Expand All @@ -347,13 +330,7 @@ private Set<String> retrieveDocIdSetForLetor(Path file) throws IOException {

if (line.startsWith("#")) continue;

int i = line.indexOf("GX");

if (i == -1) {
throw new RuntimeException("cannot find # in " + line);
}

String docId = line.substring(i, line.indexOf(" ", i)).trim();
String docId = Track.whiteSpaceSplitter.split(line)[2];

docIdSet.add(docId);
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimContentDescription.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.listContent, base.description);
// return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.description));
// return base.textSimilarity(base.listContent, base.description);
return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.description));
}
}
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimContentH.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.listContent, base.hTags);
// return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.hTags));
// return base.textSimilarity(base.listContent, base.hTags);
return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.hTags));
}
}
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimContentKeyword.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.listContent, base.keyword);
// return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.keyword));
// return base.textSimilarity(base.listContent, base.keyword);
return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.keyword));
}
}
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimContentTitle.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.listContent, base.title);
// return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.title));
// return base.textSimilarity(base.listContent, base.title);
return base.cosSim(String.join(" ",base.listContent),String.join(" ",base.title));
}
}
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimDescriptionH.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.description, base.hTags);
// return base.cosSim(String.join(" ",base.description),String.join(" ",base.hTags));
// return base.textSimilarity(base.description, base.hTags);
return base.cosSim(String.join(" ",base.description),String.join(" ",base.hTags));
}
}

4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimKeywordDescription.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.keyword, base.description);
// return base.cosSim(String.join(" ",base.keyword),String.join(" ",base.description));
// return base.textSimilarity(base.keyword, base.description);
return base.cosSim(String.join(" ",base.keyword),String.join(" ",base.description));
}
}
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimKeywordH.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.keyword, base.hTags);
// return base.cosSim(String.join(" ",base.keyword),String.join(" ",base.hTags));
// return base.textSimilarity(base.keyword, base.hTags);
return base.cosSim(String.join(" ",base.keyword),String.join(" ",base.hTags));
}
}

4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimTitleDescription.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.title, base.description);
// return base.cosSim(String.join(" ",base.title),String.join(" ",base.description));
// return base.textSimilarity(base.title, base.description);
return base.cosSim(String.join(" ",base.title),String.join(" ",base.description));
}
}
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimTitleH.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.title, base.hTags);
// return base.cosSim(String.join(" ",base.title),String.join(" ",base.hTags));
// return base.textSimilarity(base.title, base.hTags);
return base.cosSim(String.join(" ",base.title),String.join(" ",base.hTags));
}
}

4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/SimTitleKeyword.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public String toString() {

@Override
public double calculate(DocFeatureBase base) throws IOException, NullPointerException {
return base.textSimilarity(base.title, base.keyword);
// return base.cosSim(String.join(" ",base.title),String.join(" ",base.keyword));
// return base.textSimilarity(base.title, base.keyword);
return base.cosSim(String.join(" ",base.title),String.join(" ",base.keyword));
}
}
4 changes: 3 additions & 1 deletion src/main/java/edu/anadolu/ltr/Traverser.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,12 @@ private int processWarcRecord(WarcRecord warcRecord) {
return 0;

String id = warcRecord.id();

if (skip(id)) return 0;
DocFeatureBase base = new DocFeatureBase(warcRecord, collectionStatistics, analyzerTag, searcher, reader, rc1);
try {
String line = base.calculate(featureList);
if(line==null) return 1;
out.get().println(line);
// System.out.println(line);
} catch (Exception ex) {
Expand Down Expand Up @@ -294,7 +296,7 @@ void traverseParallel(Path resultPath, int numThreads) throws IOException {
}catch (Exception ex){
ex.printStackTrace();
}
}else if(collection.equals(Collection.CW12A)||collection.equals(Collection.CW12B)){
}else if(collection.equals(Collection.CW12A)||collection.equals(Collection.CW12B)||collection.equals(Collection.NTCIR)){
if("all".equals(resultsettype)){
try (Stream<Path> stream = Files.find(docsPath, 4, new WarcMatcher(suffix))) {

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/edu/anadolu/ltr/TraverserForQD.java
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public void run() {

Thread.currentThread().setName(inputWarcFile.toAbsolutePath().toString());

if (Collection.CW09A.equals(collection) || Collection.CW09B.equals(collection)) {
if (Collection.CW09A.equals(collection) || Collection.CW09B.equals(collection) || Collection.MQ09.equals(collection)) {
int addCount = processClueWeb09WarcFile();
//System.out.println("*./" + inputWarcFile.getParent().getFileName().toString() + File.separator + inputWarcFile.getFileName().toString() + " " + addCount);
} else if (Collection.CW12A.equals(collection) || Collection.CW12B.equals(collection) || Collection.NTCIR.equals(collection)) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/anadolu/ltr/URLWiki.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ public double calculate(DocFeatureBase base) {

return host.contains("wikipedia")?1.0:0.0;
} catch (URISyntaxException e) {
System.out.println("url syntax: " + base.url);
System.out.println("wiki url syntax: " + base.url);
} catch (NullPointerException e1){
System.out.println("null url : " + base.url);
System.out.println("wiki null url : " + base.url);
}

return 0.0;
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/edu/anadolu/similarities/DFRee.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ public double score(double tf, long docLength, double averageDocumentLength, dou
double norm = tf * log2(posterior / prior);

return keyFrequency * norm * (
tf * (
-log2(prior * InvPriorCollection)
tf * (-log2(prior * InvPriorCollection)
)
+
(tf + 1d) * (
Expand Down
37 changes: 16 additions & 21 deletions src/main/java/org/clueweb09/tracks/WWW15.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,24 @@
import java.io.IOException;
import java.nio.file.Paths;

/**
* The NTCIR-15 WWW-3 English Subtask
* http://sakailab.com/www3english/
*/
public class WWW15 extends WWW13 {
public class WWW15 extends WWW13{

@Override
protected int offset() {
return 100;
}
@Override
protected int offset() {
return 200;
}

@Override
protected void populateInfoNeeds() throws IOException {
populateInfoNeedsWWW(Paths.get(home, "topics-and-qrels", "www3topics-E.xml"));
}
@Override
protected void populateInfoNeeds() throws IOException {
populateInfoNeedsWWW(Paths.get(home, "topics-and-qrels", "www3topics-E.xml"));
}

@Override
protected void populateQRelsMap() throws IOException {
// tail -n -16677 ntcir15www2+3official.qrels >> ntcir15www3.qrels
populateQRelsMap(Paths.get(home, "topics-and-qrels", "ntcir15www3.qrels"));
}
@Override
protected void populateQRelsMap() throws IOException {
populateQRelsMap(Paths.get(home, "topics-and-qrels", "www3e.qrels"));
}

public WWW15(String home) {
super(home, Paths.get(home, "topics-and-qrels", "qrels.www.201-280.txt"));
}
public WWW15(String home) {
super(home, Paths.get(home, "topics-and-qrels", "qrels.www.201-280.txt"));
}
}

0 comments on commit 9b3d7cb

Please sign in to comment.