Skip to content

Commit

Permalink
Started integration between crawler and indexer
Browse files Browse the repository at this point in the history
  • Loading branch information
MohamedWalid1 committed Jun 5, 2021
1 parent e40e6c5 commit 145cfb3
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 191 deletions.
155 changes: 0 additions & 155 deletions Test.txt
Original file line number Diff line number Diff line change
@@ -1,155 +0,0 @@
https://www.linkedin.com
https://www.javatpoint.com/java-tutorial
https://www.javatpoint.com/exception-handling-in-java
https://www.javatpoint.com/javafx-tutorial
https://www.javatpoint.com/corejava-interview-questions
https://www.javatpoint.com/New-features-in-java
https://www.javatpoint.com/opr/test.jsp?filename=RegexExample1
https://www.javatpoint.com/opr/test.jsp?filename=RegexExample6
https://www.javatpoint.com/soapui
https://www.javatpoint.com/rpa
https://www.javatpoint.com/postgresql-tutorial
https://www.javatpoint.com/verilog
https://www.javatpoint.com/interview-questions-and-answers
https://www.javatpoint.com/aws-tutorial
https://www.javatpoint.com/reactjs-tutorial
https://www.javatpoint.com/machine-learning
https://www.javatpoint.com/os-tutorial
https://www.javatpoint.com/discrete-mathematics-tutorial
https://www.javatpoint.com/automata-tutorial
https://www.javatpoint.com/programs-list
https://www.javatpoint.com/data-mining
https://www.youtube.com/channel/UCUnYvQVCrJoFWZhKK3O2xLg
https://javatpoint.blogspot.com
https://www.lyricsia.com
https://www.javatpoint.com/subscribe.jsp
https://www.javatpoint.com/privacy-policy
https://www.linkedin.com/signup/cold-join?trk=guest_homepage-basic_nav-header-join
https://www.javatpoint.com/sonoo-jaiswal
https://www.linkedin.com/jobs/administrative-assistant-jobs-giza?trk=homepage-basic_suggested-search
https://www.linkedin.com/jobs/marketing-jobs-giza?trk=homepage-basic_suggested-search
https://www.linkedin.com/jobs/sales-jobs-giza?trk=homepage-basic_suggested-search
https://www.linkedin.com/jobs/consulting-jobs-giza?trk=homepage-basic_suggested-search
https://www.linkedin.com/jobs/military-and-protective-services-jobs-giza?trk=homepage-basic_suggested-search
https://www.linkedin.com/jobs/research-jobs-giza?trk=homepage-basic_suggested-search
https://www.linkedin.com/learning/topics/training-and-education?trk=homepage-basic_learning-cta
https://www.g9g.com/top.php
https://www.linkedin.com/learning/topics/marketing-2?trk=homepage-basic_learning-cta
https://www.g9g.com/register.php
https://www.linkedin.com/learning/topics/aec?trk=homepage-basic_learning-cta
https://www.linkedin.com/learning/topics/photography-2?trk=homepage-basic_learning-cta
https://www.linkedin.com/learning/topics/user-experience?trk=homepage-basic_learning-cta
https://www.linkedin.com/learning/topics/product-and-manufacturing?trk=homepage-basic_learning-cta
https://www.linkedin.com/learning/topics/devops?trk=homepage-basic_learning-cta
https://www.g9g.com/cat9.htm
https://www.linkedin.com/signup/cold-join?trk=guest_homepage-basic_directory
https://www.linkedin.com/help/linkedin?lang=en&trk=homepage-basic_directory_helpCenterUrl
https://www.g9g.com/game967.htm
https://mobile.linkedin.com/?trk=homepage-basic_directory_mobileMicrositeUrl
https://business.linkedin.com/sales-solutions?src=li-footer&utm_source=linkedin&utm_medium=footer&trk=homepage-basic_directory_salesSolutionsMicrositeUrl
https://www.linkedin.com/directory/companies?trk=homepage-basic_directory_companyDirectoryUrl
https://www.g9g.com/game959.htm
https://www.linkedin.com/directory/articles?trk=homepage-basic_directory_articlesDirectoryUrl
https://www.linkedin.com/directory/newsletters?trk=homepage-basic_directory_newslettersDirectoryUrl
https://www.g9g.com/game953.htm
https://www.linkedin.com/legal/user-agreement?trk=homepage-basic_footer-user-agreement
https://www.linkedin.com/psettings/guest-controls?trk=homepage-basic_footer-guest-controls
https://www.g9g.com/game148.htm
https://www.javatpoint.com/android-tutorial
https://www.javatpoint.com/jira-tutorial
https://www.javatpoint.com/testng-tutorial
https://www.javatpoint.com/junit-tutorial
https://www.javatpoint.com/sqlite-tutorial
https://www.javatpoint.com/mariadb-tutorial
https://www.javatpoint.com/memcached-tutorial
https://www.javatpoint.com/json-tutorial
https://www.javatpoint.com/expressjs-tutorial
https://www.javatpoint.com/xslt-tutorial
https://www.javatpoint.com/pure-css-tutorial
https://www.javatpoint.com/ejb-tutorial
https://www.javatpoint.com/design-patterns-in-java
https://www.javatpoint.com/richfaces-tutorial
https://www.javatpoint.com/java-8-features
https://www.javatpoint.com/pdfbox-tutorial
https://www.javatpoint.com/primary-data-vs-secondary-data
https://github.com/features/actions
https://www.javatpoint.com/herbivores-vs-carnivores
https://www.javatpoint.com/diagonal-traversal-of-binary-tree
https://github.com/features/integrations
https://github.com/team
https://www.javatpoint.com/what-is-data-scraping
https://www.javatpoint.com/best-gaming-phones
https://opensource.guide
https://www.javatpoint.com/create-url-shortener-in-laravel
https://github.com/events
https://www.javatpoint.com/range-of-int-in-c
https://stars.github.com
https://www.javatpoint.com/free-nosql-databases
https://www.javatpoint.com/kilobit
https://www.javatpoint.com/add-highcharts-using-angular-9-8
https://github.com/login
https://github.com/#home-collaborate
https://www.javatpoint.com/group-discussion-topics-on-business-and-economy
https://github.com/#home-community
https://www.npmjs.com
https://www.javatpoint.com/table-variable-in-sql-server
https://www.javatpoint.com/what-is-cvv-code
https://www.javatpoint.com/what-is-data-hiding
https://www.javatpoint.com/angular-9-8-routing-and-nested-routing
https://github.com/join?ref_cta=Sign+up+for+GitHub&ref_loc=footer+launchpad&ref_page=%2F
https://enterprise.github.com/contact?ref_page=/&ref_cta=Contact%20Sales&ref_loc=footer%20launchpad
https://www.javatpoint.com/graphs-examples
https://github.com/flutter/flutter
https://www.javatpoint.com/java-data-types
https://www.javatpoint.com/unicode-system-in-java
https://www.javatpoint.com/java-keywords
https://www.javatpoint.com/java-if-else
https://github.com/ohmyzsh/ohmyzsh
https://www.javatpoint.com/what-is-the-formula-for-perimeter
https://www.javatpoint.com/swap-two-numbers-in-java-using-function
https://www.javatpoint.com/object-and-class-in-java
https://www.javatpoint.com/method-in-java
https://www.javatpoint.com/small-intestine-vs-large-intestine
https://www.javatpoint.com/bootstrap-collapse-using-angular-9-8
https://services.github.com/
https://www.javatpoint.com/method-overloading-in-java
https://www.javatpoint.com/covariant-return-type
https://github.blog
https://github.com/about/diversity
https://www.javatpoint.com/static-binding-and-dynamic-binding
https://www.javatpoint.com/data-hiding-in-python
https://www.javatpoint.com/difference-between-abstract-class-and-interface
https://www.javatpoint.com/access-modifiers
https://www.youtube.com/github
https://www.javatpoint.com/encapsulation
https://www.linkedin.com/company/github
https://www.javatpoint.com/object-class
https://www.javatpoint.com/java-math
https://github.com/site-map
https://github.com/git-guides
https://www.javatpoint.com/software-testing-vs-embedded-testing
https://www.javatpoint.com/creating-api-document
https://www.javatpoint.com/difference-between-object-and-class
https://www.javatpoint.com/proxy-server-list
https://www.javatpoint.com/bootstrap3-footer
https://www.javatpoint.com/java-main-method
https://www.javatpoint.com/flatten-list-in-python
https://www.javatpoint.com/json-compare
https://www.javatpoint.com/java-programs#java-basic-programs
https://www.javatpoint.com/delirium-vs-dementia
https://www.javatpoint.com/servicenow-interview-questions
https://www.javatpoint.com/java-programs#java-string-programs
https://www.javatpoint.com/how-to-calculate-the-square-root
https://www.javatpoint.com/bootstrap-modal-popup-in-angular-9-8
https://www.javatpoint.com/sql-server-rename-table
https://www.javatpoint.com/sum-of-numbers-in-java
https://www.javatpoint.com/what-is-excel
https://www.javatpoint.com/centre-of-mass-vs-centre-of-gravity
https://www.javatpoint.com/prime-number-program-in-java
https://www.javatpoint.com/factorial-program-in-java
https://www.javatpoint.com/component-testing-vs-unit-testing
https://www.javatpoint.com/how-to-create-object-in-java
https://www.javatpoint.com/how-to-reverse-a-number-in-java
https://www.javatpoint.com/automorphic-number-program-in-java
https://www.javatpoint.com/destructors-in-python
https://www.javatpoint.com/fascinating-n
Empty file removed robots1.txt
Empty file.
53 changes: 50 additions & 3 deletions src/com/company/Crawler/Database.java
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
package com.company.Crawler;
import com.company.Indexer.Website;
import com.company.Indexer.Word;
import org.bson.Document;
import org.bson.types.ObjectId;
import com.mongodb.*;
Expand All @@ -21,6 +23,8 @@


import javax.swing.*;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.LinkedList;
Expand All @@ -33,8 +37,9 @@
public class Database {
DB crawlerDatabase;
MongoClient mongoClient;
DBCollection websites ;
DBCollection websites;
DBCollection disallowedWebsite;
DBCollection IndexerCollection;
//DBCollection hyberlinks;
//DBCollection htmlDocuments;
//DBCollection time;
Expand Down Expand Up @@ -74,7 +79,11 @@ public Database() {
//htmlDocuments = crawlerDatabase.getCollection("HTMLDocuments");
//time = crawlerDatabase.getCollection("time");


IndexerCollection = crawlerDatabase.getCollection("IndexerCollection");
IndexerCollection.createIndex("Value");
IndexerCollection.createIndex("DF");
IndexerCollection.createIndex("ListOfDocuments");
IndexerCollection.createIndex("Time");
}

public void AddVisited(String website, String time){
Expand Down Expand Up @@ -118,10 +127,26 @@ public void AddDisallowed(String website1, String time){
.append("rank",(double) 0.0)
.append("Time",time);

disallowedWebsite.insert(row);}
disallowedWebsite.insert(row);
}

}

/**
 * Stores an index entry for {@code word} unless a document with the same
 * "Value" already exists in the indexer collection.
 *
 * <p>The write is a single upsert using {@code $setOnInsert}, so the
 * "does it exist?" check and the insert happen atomically on the server.
 * The previous count-then-insert sequence needed two round trips and let
 * two concurrent callers both see a count of 0 and insert duplicates.
 * An already-present document is left untouched, as before.
 *
 * @param word the word whose value, DF and document list are stored
 * @param time timestamp string written to the "Time" field of a new entry
 */
public void AddIndexed(Word word, String time)
{
    DBObject SearchQ = new BasicDBObject("Value", word.getValue());

    // "Value" is not repeated here: on an upsert-insert the server seeds the
    // new document with the equality fields of the query.
    // NOTE(review): assumes word.getValue() is a plain scalar (e.g. String) - confirm.
    BasicDBObject row = new BasicDBObject("DF", word.getDF())
            .append("ListOfDocuments", word.getListOfDocuments())
            .append("Time", time);

    // upsert = true (insert when nothing matches), multi = false.
    // $setOnInsert is ignored when a matching document already exists.
    IndexerCollection.update(SearchQ, new BasicDBObject("$setOnInsert", row), true, false);
}

public void Update(String str, String time)
{
DBObject SearchQ = new BasicDBObject("URL", str);
Expand All @@ -133,4 +158,26 @@ public void Update(String str, String time)

}

/**
 * Marks the website document whose "URL" equals {@code str} as indexed by
 * setting "indexed" to 1 and "Time" to {@code time}.
 *
 * <p>If no document has that URL, nothing is written.
 *
 * @param str  URL of the website document to update
 * @param time timestamp string stored in the "Time" field
 */
public void UpdateIndex(String str, String time)
{
    DBObject SearchQ = new BasicDBObject("URL", str);
    DBObject ObjectQ = new BasicDBObject("indexed", 1)
            .append("Time", time);
    DBObject UpdateQ = new BasicDBObject("$set", ObjectQ);

    // No separate existence check: the two-argument update is neither an
    // upsert nor a multi-update, so it is already a no-op when nothing
    // matches. Dropping the count query saves a round trip and removes the
    // gap between check and write.
    websites.update(SearchQ, UpdateQ);
}

/**
 * Appends to {@code Visited} one {@link Website} for every document in the
 * websites collection that has "crawled" equal to 1 and "indexed" equal to 0.
 *
 * <p>Each {@code Website} is built from the document's "URL" field and a
 * running zero-based position in the result set, which is passed as its ID.
 *
 * @param Visited list that receives the websites, in cursor order
 * @throws MalformedURLException declared by the {@code Website} constructor
 * @throws FileNotFoundException declared by the {@code Website} constructor
 */
public void getCrawled(LinkedList<Website> Visited) throws MalformedURLException, FileNotFoundException {
    DBCursor cur = websites.find(new BasicDBObject("crawled", 1).append("indexed", 0));
    try {
        int id = 0;
        // Drive the loop from the cursor itself rather than a separate size()
        // count: the collection can change between the count and the
        // iteration, which would make next() throw or skip documents.
        while (cur.hasNext()) {
            DBObject doc = cur.next();
            String URL = (String) doc.get("URL");
            Visited.add(new Website(URL, id));
            id++;
        }
    } finally {
        // Release the server-side cursor even if a Website constructor throws.
        cur.close();
    }
}

}
4 changes: 2 additions & 2 deletions src/com/company/Indexer/Website.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ public Website(String str,Integer ID) throws MalformedURLException, FileNotFound
if (w.getHost() != null) {
con.getResponseCode();

org.jsoup.nodes.Document doc = Jsoup.connect(str).userAgent("Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2").followRedirects(true).method(Connection.Method.GET).timeout(1200000).ignoreHttpErrors(true).get();
this.html = doc;
//org.jsoup.nodes.Document doc = Jsoup.connect(str).userAgent("Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2").followRedirects(true).method(Connection.Method.GET).timeout(1200000).ignoreHttpErrors(true).get();
//this.html = doc;
}
} catch (IOException e) {
e.printStackTrace();
Expand Down
Loading

0 comments on commit 145cfb3

Please sign in to comment.