Skip to content

Commit

Permalink
Add more checks
Browse files Browse the repository at this point in the history
  • Loading branch information
Basantloay committed May 26, 2021
1 parent e74fdaf commit 4fbc984
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 14 deletions.
Binary file modified out/production/Search_Engine/com/company/Crawler/Crawler.class
Binary file not shown.
Binary file modified out/production/Search_Engine/com/company/Main.class
Binary file not shown.
30 changes: 21 additions & 9 deletions src/com/company/Crawler/Crawler.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.company.Crawler;

import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
Expand All @@ -18,11 +19,12 @@
import java.util.*;

public class Crawler {
public String name="SENinja" ;
private static int max = 5000;
public static int crawlerNumber = 5;
private Set<String> seedSetVisited;
private Queue<String> seedSetVisited;
private Vector<Integer> removedItems;
private LinkedList<String> seedSet;
private Queue<String> seedSet;
private int ID;
AtomicInteger crawlerCount = new AtomicInteger();
private Date recrawlTime;
Expand All @@ -32,7 +34,7 @@ public class Crawler {
Vector<String> disallowed;
Vector<String> allowed;

public Crawler(int id, LinkedList<String> seedSet, Set<String> seedSetVisited, Vector<String> disallowed, Vector<String> allowed) {
public Crawler(int id, Queue<String> seedSet, LinkedList<String> seedSetVisited, Vector<String> disallowed, Vector<String> allowed) {
this.ID = id;
this.seedSet = seedSet;
this.seedSetVisited = seedSetVisited;
Expand Down Expand Up @@ -64,9 +66,12 @@ else if(find && cont)
{
if(line.contains("Disallow"))
{
int index=line.length()-1;
if(line.contains("*"))
index=line.indexOf("*");
// index 10 is the first character after the "Disallow: " prefix
disallowed.add(args + (line.substring(10)));
System.out.println(args + (line.substring(10)));
disallowed.add(args + (line.substring(10,index)+line.substring(index+1)));
System.out.println(args + (line.substring(10,index)+line.substring(index+1)));
}
else if(line.contains("Allow"))
{
Expand All @@ -91,8 +96,8 @@ else if(line.contains("Allow"))



public void parse(String args)
{
public void parse(String args) throws IOException {
FileWriter f1=new FileWriter("Test.txt");
int j=0;
try {
File file = new File(args);
Expand All @@ -104,12 +109,17 @@ public void parse(String args)
}
scannedFile.close();
Integer i=0;


while (!seedSet.isEmpty() || crawlerCount.intValue()<max) {
crawlerCount.incrementAndGet();
System.out.println(crawlerCount);
String website = seedSet.get(i);
String website = seedSet.remove();
f1.write(website+'\n');
robots(website, i);
Document doc = Jsoup.connect(website).get();
int timeout;
Document doc = Jsoup.connect(website).userAgent("Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2").method(Connection.Method.POST)
.timeout(0).ignoreHttpErrors(true).get();
Elements links = doc.select("a[href]");

i++;
Expand Down Expand Up @@ -137,13 +147,15 @@ public void parse(String args)
synchronized (seedSet) {
seedSet.remove(website);
}

}

} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
f1.close();
}


Expand Down
7 changes: 4 additions & 3 deletions src/com/company/Main.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
package com.company;
import com.company.Crawler.Crawler;

import java.io.IOException;
import java.util.*;

public class Main {

public static void main(String[] args) {
public static void main(String[] args) throws IOException {
LinkedList<String>l=new LinkedList<>() ;
Vector<String>disallowed=new Vector<String>();
//Map<String, Vector<String>> allowed=new HashMap<String, Vector<String>>();
Vector<String>allowed=new Vector<String>();
Set<String> seedSetVisited=new HashSet<String>();
Crawler c=new Crawler(0,l,seedSetVisited,disallowed,allowed);
Queue<String> seedSetVisited=new LinkedList<>();
Crawler c=new Crawler(0,l, (LinkedList<String>) seedSetVisited,disallowed,allowed);
c.parse("websites.txt");
}
}
5 changes: 3 additions & 2 deletions websites.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
https://www.pinterest.com
https://www.wikipedia.org
https://edition.cnn.com
https://www.linkedin.com
https://edition.cnn.com
https://www.g9g.com

0 comments on commit 4fbc984

Please sign in to comment.