Skip to content

Commit

Permalink
ver 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
WONG JUN LIN authored and WONG JUN LIN committed Apr 1, 2013
1 parent 3fd96c7 commit 481517b
Showing 1 changed file with 293 additions and 0 deletions.
293 changes: 293 additions & 0 deletions client/client.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <pthread.h>
#include <stdint.h>
#include <unistd.h>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <fstream>
#include <iostream>
#include <string>

#include <boost/regex.hpp>

/// Presumably the name of a URL list file — TODO confirm; nothing in the
/// visible code references this macro.
/// NOTE(review): the macro name is the same as the C stdio `FILE` type, so any
/// code after this line that spells `FILE` as a type would be rewritten to the
/// string literal; consider renaming it.
#define FILE "url.txt"

using namespace std;

/// Prints the given string to standard output, prefixed with "str : " and
/// followed by a newline (the stream is flushed by endl).
/// The text is emitted through its C-string view, so output stops at the
/// first embedded NUL character, if any.
void some_function(string str)
{
    const char *const text = str.c_str();

    cout << "str : ";
    cout << text;
    cout << endl;
}

/// Forward declarations for the helpers defined further down in this file.
string getHostUrl(string url);
string getLocationUrl(string url);
void* spider_thread ( void *args );

/// Every distinct URL extracted so far. Appended to by spider_thread(), which
/// holds `lock` while scanning and inserting.
deque <string> all_links;

/// Host part and location (path) part of the URL currently being crawled.
/// Written by main() before a worker is started and read by spider_thread().
string hostAddress;
string locationAddress;

/// Points at main()'s socket connected to the controlling server; null until
/// main() has set it. spider_thread() reports its results through it.
int *serverFD = NULL;

/// Guards all_links. Statically initialised: POSIX requires a mutex to be
/// initialised (PTHREAD_MUTEX_INITIALIZER or pthread_mutex_init) before it is
/// locked; relying on zero-initialisation is not portable.
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/**
 * Client entry point.
 *
 * Connects to the controlling server on 127.0.0.1:5000, then repeatedly:
 *   1. reads a URL from the server, one byte at a time, until recv() stops
 *      delivering data,
 *   2. splits it into host and location parts (stored in the globals
 *      hostAddress / locationAddress),
 *   3. starts a spider_thread() worker with a fresh TCP socket for the crawl.
 *
 * The loop ends when nothing more can be read from the server (connection
 * closed or failed) or when a socket/thread cannot be created. Workers that
 * were started are joined before the server socket is closed, because they
 * report their results through it (via serverFD).
 *
 * @return 0 on a normal shutdown, -1 on a setup or resource error.
 */
int main ()
{
    /// Create new TCP socket.
    int client_sock = socket ( AF_INET , SOCK_STREAM , 0 );
    if ( client_sock < 0 )
    {
        cerr << "Error creating Server socket\n";
        return -1;
    }

    /// gethostbyname() returns NULL on failure; never dereference it unchecked.
    struct hostent *server_host = gethostbyname("127.0.0.1");
    if ( server_host == NULL || server_host->h_addr_list[0] == NULL )
    {
        cerr << "Error resolving server address\n";
        close ( client_sock );
        return -1;
    }

    /// memset() already clears sin_zero, so no separate bzero() is needed.
    struct sockaddr_in server_addr;
    memset ( &server_addr , 0 , sizeof ( server_addr ) );
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons ( 5000 );
    memcpy ( &server_addr.sin_addr , server_host->h_addr_list[0] , sizeof ( server_addr.sin_addr ) );

    /// Connect to server at port 5000
    if ( connect ( client_sock , (struct sockaddr *) &server_addr , sizeof ( server_addr ) ) == -1 )
    {
        cerr << "Error connecting socket";
        close ( client_sock );
        return -1;
    }

    deque <pthread_t> workers;   // every worker started, joined before exit
    int exit_code = 0;

    while (true)
    {
        /// receive url string from server
        string url_from_server;
        char ch;

        while ( recv ( client_sock , &ch , sizeof(ch) , 0 ) == 1 )
        {
            url_from_server += ch;
        }

        /// Nothing received means the server closed the connection (or recv
        /// failed); looping again would only spin on a dead socket.
        if ( url_from_server.empty() )
        {
            break;
        }

        cout << url_from_server << endl;

        /// divide url string into host and location url
        const string host = getHostUrl(url_from_server);

        if ( gethostbyname ( host.c_str() ) != NULL )
        {
            hostAddress = host;
            locationAddress = getLocationUrl(url_from_server);
            serverFD = &client_sock;

            /// Create new TCP socket for the crawl.
            const int sock = socket ( AF_INET , SOCK_STREAM , 0 );
            if ( sock < 0 )
            {
                cerr << "Error creating Server socket\n";
                exit_code = -1;
                break;
            }

            /// The worker takes ownership of this allocation and deletes it.
            int *sockFD = new int;
            *sockFD = sock;

            pthread_t threadID;
            if ( pthread_create ( &threadID , NULL , spider_thread , sockFD ) != 0 )
            {
                cerr << "Error on pthread_create()\n";
                /// No worker exists to release these, so do it here.
                delete sockFD;
                close ( sock );
                exit_code = -1;
                break;
            }

            workers.push_back ( threadID );
        }
        else
        {
            /// Invalid url
            cerr << "Invalid url\n";
        }

        sleep(3);
    }

    /// Let running workers finish before their reporting socket goes away.
    for ( deque<pthread_t>::iterator it = workers.begin(); it != workers.end(); ++it )
    {
        pthread_join ( *it , NULL );
    }

    close ( client_sock );

    return exit_code;
}

/* Returns the host part of a URL address string: everything that precedes the
   first '/' character, or the whole string when it contains no '/'.
   The split is purely on the first '/', so a scheme prefix is not recognised:
   "http://a.com/x" yields "http:". */
string getHostUrl(string full_addr)
{
    // substr() clamps the count, so a "not found" position (npos) keeps the
    // entire string, which is exactly the no-slash case.
    const size_t slash_pos = full_addr.find('/');
    return full_addr.substr(0, slash_pos);
}

/* Returns the location part of a URL address string: everything from the
   first '/' character (inclusive) to the end, or "/" when the string contains
   no '/' at all.
   The split is purely on the first '/', so a scheme prefix is not recognised:
   "http://a.com/x" yields "//a.com/x". */
string getLocationUrl(string full_addr)
{
    const size_t slash_pos = full_addr.find('/');

    if ( slash_pos == string::npos )
    {
        return "/";
    }

    return full_addr.substr(slash_pos);
}



/* Thread process */
void* spider_thread( void *args )
{
struct sockaddr_in server;
struct hostent *host_addr;

int *clientSocketFD = (int*) args;
int sockFD = *clientSocketFD;
delete clientSocketFD;

/// Bind to IP Address of this machine on port 80.
host_addr = gethostbyname(hostAddress.c_str());

memset ( &server , 0 , sizeof ( struct sockaddr_in ) );
server.sin_family = AF_INET;
server.sin_port = htons ( 80 );
server.sin_addr = *((struct in_addr *) host_addr->h_addr);
bzero(&(server.sin_zero),8);

if (connect(sockFD, (struct sockaddr *)&server, sizeof(struct sockaddr)) == -1)
{
cerr << "Error connecting\n";
close ( sockFD );
pthread_exit(NULL);
}

/// HTTP request message
string send_data = "GET " + locationAddress + " HTTP/1.1\r\nHost: " + hostAddress + "\r\n\r\n";
cout << send_data << endl;

if (send ( sockFD , send_data.c_str() , send_data.length() , 0 ) < 0)
{
cerr << "Error on send()\n";
close ( sockFD );
pthread_exit(NULL);
}

signed char ch;
string echo = "";

/// HTTP reply message
while ( recv( sockFD ,&ch, sizeof(ch), 0 ) == 1 )
{
echo += ch;
}

/// HTTP html body
unsigned pos = echo.find("\r\n\r\n");
string httpbody = echo.substr (pos+4);
pos = httpbody.find_first_of("\r\n");
string body = httpbody.substr(pos+2);

/// Extract all URLs from the HTTP response
string regex_pattern = "(http://|https://)([a-zA-Z0-9]+\.[a-zA-Z0-9\-]+|[a-zA-Z0-9\-]+)\.[a-zA-Z\.]{2,25}(/[a-zA-Z0-9\.\?=/#%&\+-]+|/|)";
boost::cmatch match;
boost::regex rgx (regex_pattern);

while (boost::regex_search (echo.c_str(),match,rgx))
{
string host_url = getHostUrl( match.str() );
string location_url = getLocationUrl( match.str() );

string new_url = host_url + location_url;

pthread_mutex_lock(&lock);

bool exist = false;
for (deque<string>::iterator it = all_links.begin(); it!=all_links.end(); ++it)
{
if (*it == new_url)
{
exist = true;
}
}

if (!exist)
{
all_links.push_back(new_url);
}

pthread_mutex_unlock(&lock);

echo = match.suffix().str();
}

/// Send data to server
// cout << "ALL LINKS" << endl;

// for (deque<string>::iterator it = all_links.begin(); it!=all_links.end(); ++it)
// {
// std::cout << *it << endl;
// }

int body_length = htonl(body.length());
int quantity = htonl(all_links.size());

cout << body.length() << endl;
cout << body_length << endl;
cout << endl;
cout << all_links.size() << endl;
cout << quantity << endl;

/// HTTP html page length
if (send(serverFD, (const char*)body_length, 4, 0) < 0)
{
cerr << "Error on send html page length\n";
}

/// HTTP html page
if (send(serverFD, body.c_str(), body.length(), 0) < 0)
{
cerr << "Error on send html page content\n";
}

/// List of URLs size
if (send(serverFD, (const char*)quantity, 4, 0) < 0)
{
cerr << "Error on send urls list size\n";
}

/// List of URLs
// if (send(serverFD, (const char*)quantity, 4, 0) < 0)
// {
// cerr << "Error on send urls list in string content\n";
// }

close (sockFD);

pthread_exit(NULL);
}

0 comments on commit 481517b

Please sign in to comment.