diff --git a/.idea/libraries/Maven__aopalliance_aopalliance_1_0.xml b/.idea/libraries/Maven__aopalliance_aopalliance_1_0.xml new file mode 100644 index 0000000..30ff5cb --- /dev/null +++ b/.idea/libraries/Maven__aopalliance_aopalliance_1_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml b/.idea/libraries/Maven__org_mongodb_mongo_java_driver_3_12_8.xml similarity index 59% rename from .idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml rename to .idea/libraries/Maven__org_mongodb_mongo_java_driver_3_12_8.xml index d375251..b9089b5 100644 --- a/.idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml +++ b/.idea/libraries/Maven__org_mongodb_mongo_java_driver_3_12_8.xml @@ -1,13 +1,13 @@ - + - + - + - + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_10.xml b/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_10.xml new file mode 100644 index 0000000..5f42bd6 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_10.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_30.xml b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_30.xml new file mode 100644 index 0000000..02b6812 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_30.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_simple_1_7_30.xml b/.idea/libraries/Maven__org_slf4j_slf4j_simple_1_7_30.xml new file mode 100644 index 0000000..e5856a0 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_simple_1_7_30.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_data_spring_data_commons_1_10_0_RELEASE.xml b/.idea/libraries/Maven__org_springframework_data_spring_data_commons_1_10_0_RELEASE.xml new file mode 100644 index 0000000..814cd16 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_data_spring_data_commons_1_10_0_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_data_spring_data_mongodb_1_7_0_RELEASE.xml b/.idea/libraries/Maven__org_springframework_data_spring_data_mongodb_1_7_0_RELEASE.xml new file mode 100644 index 0000000..0f80305 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_data_spring_data_mongodb_1_7_0_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_aop_4_0_9_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_aop_4_0_9_RELEASE.xml new file mode 100644 index 0000000..b9f0278 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_aop_4_0_9_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_beans_4_0_9_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_beans_4_0_9_RELEASE.xml new file mode 100644 index 0000000..f98ea8c --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_beans_4_0_9_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_context_4_0_9_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_context_4_0_9_RELEASE.xml new file mode 100644 index 0000000..802cb82 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_context_4_0_9_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_core_4_0_9_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_core_4_0_9_RELEASE.xml new file mode 100644 index 0000000..1019cb9 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_core_4_0_9_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_expression_4_0_9_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_expression_4_0_9_RELEASE.xml new file mode 100644 index 0000000..20892fa --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_expression_4_0_9_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_tx_4_0_9_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_tx_4_0_9_RELEASE.xml new file mode 100644 index 0000000..43aa1ad --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_tx_4_0_9_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/Search_Engine.iml b/Search_Engine.iml index c2cd65a..e820eb3 100644 --- a/Search_Engine.iml +++ b/Search_Engine.iml @@ -23,7 +23,7 @@ - + @@ -32,5 +32,17 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/Test.txt b/Test.txt new file mode 100644 index 0000000..e69de29 diff --git a/pom.xml b/pom.xml index a5d9d18..29f1cc6 100644 --- a/pom.xml +++ b/pom.xml @@ -19,11 +19,13 @@ mongodb-driver-sync 3.12.8 + org.mongodb mongo-java-driver - 2.13.3 + 3.12.8 + org.jsoup @@ -53,6 +55,27 @@ lucene-analyzers-common 4.10.4 + + org.springframework.data + spring-data-mongodb + 1.7.0.RELEASE + + + + org.slf4j + slf4j-api + 1.7.30 + + + + + + org.slf4j + slf4j-simple + 1.7.30 + test + + diff --git a/src/com/company/Crawler/Database.java b/src/com/company/Crawler/Database.java index f4376ed..367853d 100644 --- a/src/com/company/Crawler/Database.java +++ b/src/com/company/Crawler/Database.java @@ -13,6 +13,12 @@ import com.mongodb.MongoClient; import com.mongodb.ParallelScanOptions; import com.mongodb.ServerAddress; +import org.jsoup.Connection; +import org.jsoup.HttpStatusException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + import javax.swing.*; import java.net.UnknownHostException; @@ -33,30 +39,24 @@ public class Database { //DBCollection htmlDocuments; //DBCollection time; public Database() { - try { - this.mongoClient = new MongoClient("localhost", 27017); - //create database - this.crawlerDatabase = mongoClient.getDB("CrawlerDatabase"); - //create collections and fields - websites = crawlerDatabase.getCollection("websites"); - websites.createIndex("URL"); - websites.createIndex("crawled"); - websites.createIndex("indexed"); - websites.createIndex("rank"); - //websites.createIndex("hyberlinks"); - websites.createIndex("HTMLDocuments"); - websites.createIndex("Time"); - disallowedWebsite=crawlerDatabase.getCollection("DisallowedWebsites"); - //hyberlinks = crawlerDatabase.getCollection("hyberlinks"); - //hyberlinks.createIndex("URL"); - //hyberlinks.createIndex("refTo"); - //htmlDocuments = crawlerDatabase.getCollection("HTMLDocuments"); - //time = crawlerDatabase.getCollection("time"); - - - } catch (UnknownHostException e) { - e.printStackTrace(); - } + this.mongoClient = new MongoClient("localhost", 27017); + //create database + this.crawlerDatabase = mongoClient.getDB("CrawlerDatabase"); + //create collections and fields + websites = crawlerDatabase.getCollection("websites"); + websites.createIndex("URL"); + websites.createIndex("crawled"); + websites.createIndex("indexed"); + websites.createIndex("rank"); + //websites.createIndex("hyberlinks"); + websites.createIndex("HTMLDocuments"); + websites.createIndex("Time"); + disallowedWebsite=crawlerDatabase.getCollection("DisallowedWebsites"); + //hyberlinks = crawlerDatabase.getCollection("hyberlinks"); + //hyberlinks.createIndex("URL"); + //hyberlinks.createIndex("refTo"); + //htmlDocuments = crawlerDatabase.getCollection("HTMLDocuments"); + //time = crawlerDatabase.getCollection("time"); } diff --git a/target/classes/com/company/Crawler/Database.class b/target/classes/com/company/Crawler/Database.class index 5e8dee7..3f6ae0a 100644 Binary files a/target/classes/com/company/Crawler/Database.class and b/target/classes/com/company/Crawler/Database.class differ