-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit fc03656
Showing
1 changed file
with
225 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
<?php | ||
define("START_ID", 1); | ||
define("STOP_TED_QUERY",20); | ||
/** | ||
* this script will run as a cron job and will go over all pages | ||
* on TED http://www.ted.com/talks/view/id/ | ||
* from id 1 till there are no more pages | ||
* after this each page will be parsed and take the title and tags from the page | ||
* last step is to query youtube API to get the url of the movie and more details | ||
*/ | ||
|
||
//example to get movies from youtube API | ||
//https://gdata.youtube.com/feeds/api/videos?author=TEDtalksDirector&q=Questioning%20the%20universe%20&v=2&alt=json | ||
|
||
/** | ||
* function get a file using curl (fast) | ||
* @param $url - url which we want to get its content | ||
* @return the data of the file | ||
* @author XXXXX | ||
*/ | ||
function file_get_contents_curl($url) | ||
{ | ||
$ch = curl_init(); | ||
|
||
curl_setopt($ch, CURLOPT_HEADER, 0); | ||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); | ||
curl_setopt($ch, CURLOPT_URL, $url); | ||
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); | ||
|
||
$data = curl_exec($ch); | ||
curl_close($ch); | ||
|
||
return $data; | ||
} | ||
|
||
/** | ||
* will represent a TED talk with all its details from the ted web site and from youTube | ||
* @author XXXXX | ||
* | ||
*/ | ||
class Talk{ | ||
|
||
var $id; | ||
var $name; | ||
var $tags = array(); | ||
var $youTubeID; | ||
var $label; | ||
var $src; | ||
var $viewCount; | ||
var $youTubeTitle; | ||
|
||
|
||
function setViewCount($count){ | ||
$this->viewCount=$count; | ||
} | ||
|
||
function getViewCount(){ | ||
return $this->viewCount; | ||
} | ||
|
||
function setYouTubeTitle($title){ | ||
$this->youTubeTitle=$title; | ||
} | ||
|
||
function getYouTubeTitle(){ | ||
return $this->youTubeTitle; | ||
} | ||
|
||
function setYouTubeID($ID){ | ||
$this->youTubeID=$ID; | ||
} | ||
|
||
function getYouTubeID(){ | ||
return $this->youTubeID; | ||
} | ||
|
||
function setLabel($Label){ | ||
$this->label=$Label; | ||
} | ||
|
||
function getLabel(){ | ||
return $this->label; | ||
|
||
} | ||
|
||
function setSrc($SRC){ | ||
$this->src=$SRC; | ||
} | ||
|
||
function getSrc(){ | ||
return $this->src; | ||
} | ||
function setID($ID){ | ||
$this->id=$ID; | ||
} | ||
|
||
function getID(){ | ||
return $this->id; | ||
} | ||
function setName($Name){ | ||
$this->name=$Name; | ||
} | ||
|
||
function getName(){ | ||
return $this->name; | ||
} | ||
|
||
function setTags($Tags){ | ||
$this->tags=$Tags; | ||
} | ||
function addTag($tag){ | ||
$this->tags[]=$tag; | ||
} | ||
|
||
function getTags(){ | ||
return $this->tags; | ||
} | ||
} | ||
|
||
//will hold all talks in array | ||
$tedTalks = array(); | ||
|
||
//id to start the query from | ||
$id=START_ID; | ||
|
||
//will indicate when needed to stop the query beacuse reached the end id's on TED website | ||
$endOFQuery=0; | ||
|
||
//get the time | ||
$time_start = microtime(true); | ||
|
||
//start the query on TED website | ||
//if we will query 20 pages in a row that do not exsist we will stop the querys and assume there are no more | ||
while ($endOFQuery < STOP_TED_QUERY){ | ||
|
||
//get the page of the talk | ||
$html = file_get_contents_curl("http://www.ted.com/talks/view/id/$id"); | ||
|
||
//parsing begins here: | ||
$doc = new DOMDocument(); | ||
@$doc->loadHTML($html); | ||
$nodes = $doc->getElementsByTagName('title'); | ||
|
||
//get and display what you need: | ||
$title = $nodes->item(0)->nodeValue; | ||
|
||
|
||
|
||
//check if this a valid page | ||
if (! strcmp ($title ,"TED | Talks")) | ||
//this is a removed ted talk or the end of the query so raise a flag (if we get anough of these in a row we will stop) | ||
$endOFQuery++; | ||
else { | ||
//this is a valid TED talk get its details | ||
|
||
//reset the flag for end of query | ||
$endOFQuery = 0; | ||
|
||
//get meta tags | ||
$metas = $doc->getElementsByTagName('meta'); | ||
|
||
//get the tag we need (keywords) | ||
for ($i = 0; $i < $metas->length; $i++) | ||
{ | ||
$meta = $metas->item($i); | ||
if($meta->getAttribute('name') == 'keywords') | ||
$keywords = $meta->getAttribute('content'); | ||
} | ||
|
||
//create new talk object and populate it | ||
$talk = new Talk(); | ||
//set its ted id from ted web site | ||
$talk->setID($id); | ||
//parse the name (name has un-needed char's in the end) | ||
$talk->setName( substr($title, 0, strpos( $title, '|')) ); | ||
|
||
//parse the String of tags to array | ||
$keywords = explode(",", $keywords); | ||
//remove un-needed items from it | ||
$keywords=array_diff($keywords, array("TED","Talks")); | ||
|
||
//add the filters tags to the talk | ||
$talk->setTags($keywords); | ||
|
||
//add to the total talks array | ||
$tedTalks[]=$talk; | ||
} | ||
|
||
//move to the next ted talk ID to query | ||
$id++; | ||
} //end of the while | ||
|
||
//going over all ted talk collected from ted web site and getting youTube details | ||
foreach ($tedTalks as $talk){ | ||
|
||
//prepare the query string with the TED TALK name as the search string | ||
$queryString = "https://gdata.youtube.com/feeds/api/videos?author=TEDtalksDirector&q=".$talk->getName()."&v=2&alt=json"; | ||
//remove all spaces to URL friendly string | ||
$queryString = str_replace(" ","%20",$queryString); | ||
//get details from API in json format | ||
$resultString = file_get_contents($queryString); | ||
|
||
//parse result string into workable json file | ||
$responseJson=json_decode($resultString,true); | ||
//get the first movie details | ||
$movieDetails = $responseJson['feed']['entry'][0]; | ||
//get all details (this section is for specific data i need you can skip this or change this to something else) | ||
$talk->setYouTubeID(substr($movieDetails['id']['$t'],strpos( $movieDetails['id']['$t'], 'video:')+6)); | ||
$talk->setLabel($movieDetails['category'][1]['label']); | ||
$talk->setYouTubeTitle($movieDetails['title']['$t']); | ||
$talk->setSrc($movieDetails['link'][0]['href']); | ||
$talk->setViewCount($movieDetails['yt$statistics']['viewCount']); | ||
|
||
//insert into DB if this movie has details from youtube and from TED site | ||
if ($youTubeID!=null){ | ||
//insert the talk into DB | ||
} | ||
|
||
} | ||
|
||
$time_end = microtime(true); | ||
$execution_time = ($time_end - $time_start); | ||
echo "this took (sec) : ".$execution_time; | ||
|
||
?> |