-
Notifications
You must be signed in to change notification settings - Fork 3
/
tedInfoGrab.php
225 lines (176 loc) · 5.33 KB
/
tedInfoGrab.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
<?php
define("START_ID", 1);
define("STOP_TED_QUERY",20);
/**
* this script will run as a crone job and will go over all pages
* on TED http://www.ted.com/talks/view/id/
* from id 1 till there are no more pages
* after this each page will be parsed and take the title and tags from the page
* last step is to query youtube API to get the url of the movie and more details
*/
//example to get movies from youtube API
//https://gdata.youtube.com/feeds/api/videos?author=TEDtalksDirector&q=Questioning%20the%20universe%20&v=2&alt=json
/**
* function get a file using curl (fast)
* @param $url - url which we want to get its content
* @return the data of the file
* @author XXXXX
*/
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
/**
* will represent a TED talk with all its details from the ted web site and from youTube
* @author XXXXX
*
*/
class Talk{
var $id;
var $name;
var $tags = array();
var $youTubeID;
var $label;
var $src;
var $viewCount;
var $youTubeTitle;
function setViewCount($count){
$this->viewCount=$count;
}
function getViewCount(){
return $this->viewCount;
}
function setYouTubeTitle($title){
$this->youTubeTitle=$title;
}
function getYouTubeTitle(){
return $this->youTubeTitle;
}
function setYouTubeID($ID){
$this->youTubeID=$ID;
}
function getYouTubeID(){
return $this->youTubeID;
}
function setLabel($Label){
$this->label=$Label;
}
function getLabel(){
return $this->label;
}
function setSrc($SRC){
$this->src=$SRC;
}
function getSrc(){
return $this->src;
}
function setID($ID){
$this->id=$ID;
}
function getID(){
return $this->id;
}
function setName($Name){
$this->name=$Name;
}
function getName(){
return $this->name;
}
function setTags($Tags){
$this->tags=$Tags;
}
function addTag($tag){
$this->tags[]=$tag;
}
function getTags(){
return $this->tags;
}
}
//will hold all talks in array
$tedTalks = array();
//id to start the query from
$id=START_ID;
//will indicate when needed to stop the query beacuse reached the end id's on TED website
$endOFQuery=0;
//get the time
$time_start = microtime(true);
//start the query on TED website
//if we will query 20 pages in a row that do not exsist we will stop the querys and assume there are no more
while ($endOFQuery < STOP_TED_QUERY){
//get the page of the talk
$html = file_get_contents_curl("http://www.ted.com/talks/view/id/$id");
//parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
//check if this a valid page
if (! strcmp ($title ,"TED | Talks"))
//this is a removed ted talk or the end of the query so raise a flag (if we get anough of these in a row we will stop)
$endOFQuery++;
else {
//this is a valid TED talk get its details
//reset the flag for end of query
$endOFQuery = 0;
//get meta tags
$metas = $doc->getElementsByTagName('meta');
//get the tag we need (keywords)
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
}
//create new talk object and populate it
$talk = new Talk();
//set its ted id from ted web site
$talk->setID($id);
//parse the name (name has un-needed char's in the end)
$talk->setName( substr($title, 0, strpos( $title, '|')) );
//parse the String of tags to array
$keywords = explode(",", $keywords);
//remove un-needed items from it
$keywords=array_diff($keywords, array("TED","Talks"));
//add the filters tags to the talk
$talk->setTags($keywords);
//add to the total talks array
$tedTalks[]=$talk;
}
//move to the next ted talk ID to query
$id++;
} //end of the while
//going over all ted talk collected from ted web site and getting youTube details
foreach ($tedTalks as $talk){
//prepare the query string with the TED TALK name as the search string
$queryString = "https://gdata.youtube.com/feeds/api/videos?author=TEDtalksDirector&q=".$talk->getName()."&v=2&alt=json";
//remove all spaces to URL friendly string
$queryString = str_replace(" ","%20",$queryString);
//get details from API in json format
$resultString = file_get_contents($queryString);
//parse result string into workable json file
$responseJson=json_decode($resultString,true);
//get the first movie details
$movieDetails = $responseJson['feed']['entry'][0];
//get all details (this section is for specific data i need you can skip this or change this to something else)
$talk->setYouTubeID(substr($movieDetails['id']['$t'],strpos( $movieDetails['id']['$t'], 'video:')+6));
$talk->setLabel($movieDetails['category'][1]['label']);
$talk->setYouTubeTitle($movieDetails['title']['$t']);
$talk->setSrc($movieDetails['link'][0]['href']);
$talk->setViewCount($movieDetails['yt$statistics']['viewCount']);
//insert into DB if this movie has details from youtube and from TED site
if ($youTubeID!=null){
//insert the talk into DB
}
}
$time_end = microtime(true);
$execution_time = ($time_end - $time_start);
echo "this took (sec) : ".$execution_time;
?>