Skip to content

Commit a6e25e2

Browse files
author
Tuan Vu
committed
Update notebooks/gcloud-example/github-trend-analysis.ipynb
1 parent 654082d commit a6e25e2

File tree

1 file changed

+290
-0
lines changed

1 file changed

+290
-0
lines changed

notebooks/gcloud-example/github-trend-analysis.ipynb

Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,296 @@
10611061
"df.head(10)"
10621062
]
10631063
},
1064+
{
1065+
"cell_type": "markdown",
1066+
"metadata": {},
1067+
"source": [
1068+
"## Example final table: GitHub repositories trending on Hacker News on 2018-12-01"
1069+
]
1070+
},
1071+
{
1072+
"cell_type": "code",
1073+
"execution_count": 20,
1074+
"metadata": {},
1075+
"outputs": [
1076+
{
1077+
"name": "stdout",
1078+
"output_type": "stream",
1079+
"text": [
1080+
"\n",
1081+
"WITH github_activity AS (\n",
1082+
"SELECT \n",
1083+
" repo.name as repo,\n",
1084+
" CONCAT('https://github.com/', repo.name) as url,\n",
1085+
" SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n",
1086+
" SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n",
1087+
" COUNT(*) AS cnt\n",
1088+
"FROM `githubarchive.day.20181201`\n",
1089+
"WHERE type IN ('WatchEvent','ForkEvent')\n",
1090+
"GROUP BY 1,2\n",
1091+
"),\n",
1092+
"hacker_news AS (\n",
1093+
"SELECT\n",
1094+
" EXTRACT(DATE FROM timestamp) as date,\n",
1095+
" `by` AS submitter,\n",
1096+
" id as story_id,\n",
1097+
" REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n",
1098+
" SUM(score) as score\n",
1099+
"FROM\n",
1100+
" `bigquery-public-data.hacker_news.full`\n",
1101+
"WHERE\n",
1102+
" type = 'story'\n",
1103+
" AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n",
1104+
" AND url LIKE '%https://github.com%'\n",
1105+
" AND url NOT LIKE '%github.com/blog/%'\n",
1106+
"GROUP BY 1,2,3,4\n",
1107+
")\n",
1108+
"\n",
1109+
"SELECT\n",
1110+
" a.date as date,\n",
1111+
" a.url as github_url,\n",
1112+
" b.repo as github_repo,\n",
1113+
" a.score as hn_score,\n",
1114+
" a.story_id as hn_story_id,\n",
1115+
" b.stars as stars,\n",
1116+
" b.forks as forks\n",
1117+
"FROM hacker_news as a\n",
1118+
"LEFT JOIN github_activity as b\n",
1119+
"ON a.url=b.url\n",
1120+
"ORDER BY hn_score DESC\n",
1121+
"LIMIT 10\n",
1122+
"\n"
1123+
]
1124+
},
1125+
{
1126+
"data": {
1127+
"text/html": [
1128+
"<div>\n",
1129+
"<style scoped>\n",
1130+
" .dataframe tbody tr th:only-of-type {\n",
1131+
" vertical-align: middle;\n",
1132+
" }\n",
1133+
"\n",
1134+
" .dataframe tbody tr th {\n",
1135+
" vertical-align: top;\n",
1136+
" }\n",
1137+
"\n",
1138+
" .dataframe thead th {\n",
1139+
" text-align: right;\n",
1140+
" }\n",
1141+
"</style>\n",
1142+
"<table border=\"1\" class=\"dataframe\">\n",
1143+
" <thead>\n",
1144+
" <tr style=\"text-align: right;\">\n",
1145+
" <th></th>\n",
1146+
" <th>date</th>\n",
1147+
" <th>github_url</th>\n",
1148+
" <th>github_repo</th>\n",
1149+
" <th>hn_score</th>\n",
1150+
" <th>hn_story_id</th>\n",
1151+
" <th>stars</th>\n",
1152+
" <th>forks</th>\n",
1153+
" </tr>\n",
1154+
" </thead>\n",
1155+
" <tbody>\n",
1156+
" <tr>\n",
1157+
" <th>0</th>\n",
1158+
" <td>2018-12-01</td>\n",
1159+
" <td>https://github.com/ithinco/i-am-chinese-the-dr...</td>\n",
1160+
" <td>ithinco/i-am-chinese-the-dragonfly-must-go-on</td>\n",
1161+
" <td>129</td>\n",
1162+
" <td>18574181</td>\n",
1163+
" <td>60.0</td>\n",
1164+
" <td>1.0</td>\n",
1165+
" </tr>\n",
1166+
" <tr>\n",
1167+
" <th>1</th>\n",
1168+
" <td>2018-12-01</td>\n",
1169+
" <td>https://github.com/YugaByte/yugabyte-db</td>\n",
1170+
" <td>YugaByte/yugabyte-db</td>\n",
1171+
" <td>115</td>\n",
1172+
" <td>18576170</td>\n",
1173+
" <td>2.0</td>\n",
1174+
" <td>NaN</td>\n",
1175+
" </tr>\n",
1176+
" <tr>\n",
1177+
" <th>2</th>\n",
1178+
" <td>2018-12-01</td>\n",
1179+
" <td>https://github.com/alertlogic/erllambda</td>\n",
1180+
" <td>alertlogic/erllambda</td>\n",
1181+
" <td>64</td>\n",
1182+
" <td>18574683</td>\n",
1183+
" <td>48.0</td>\n",
1184+
" <td>NaN</td>\n",
1185+
" </tr>\n",
1186+
" <tr>\n",
1187+
" <th>3</th>\n",
1188+
" <td>2018-12-01</td>\n",
1189+
" <td>https://github.com/oxplot/pdftilecut</td>\n",
1190+
" <td>oxplot/pdftilecut</td>\n",
1191+
" <td>64</td>\n",
1192+
" <td>18575094</td>\n",
1193+
" <td>91.0</td>\n",
1194+
" <td>NaN</td>\n",
1195+
" </tr>\n",
1196+
" <tr>\n",
1197+
" <th>4</th>\n",
1198+
" <td>2018-12-01</td>\n",
1199+
" <td>https://github.com/chocolatey/boxstarter</td>\n",
1200+
" <td>chocolatey/boxstarter</td>\n",
1201+
" <td>9</td>\n",
1202+
" <td>18575802</td>\n",
1203+
" <td>1.0</td>\n",
1204+
" <td>NaN</td>\n",
1205+
" </tr>\n",
1206+
" <tr>\n",
1207+
" <th>5</th>\n",
1208+
" <td>2018-12-01</td>\n",
1209+
" <td>https://github.com/devsnek/engine262</td>\n",
1210+
" <td>devsnek/engine262</td>\n",
1211+
" <td>8</td>\n",
1212+
" <td>18577658</td>\n",
1213+
" <td>1.0</td>\n",
1214+
" <td>NaN</td>\n",
1215+
" </tr>\n",
1216+
" <tr>\n",
1217+
" <th>6</th>\n",
1218+
" <td>2018-12-01</td>\n",
1219+
" <td>https://github.com/andrewchaa/functional.pipe</td>\n",
1220+
" <td>andrewchaa/functional.pipe</td>\n",
1221+
" <td>4</td>\n",
1222+
" <td>18574107</td>\n",
1223+
" <td>2.0</td>\n",
1224+
" <td>NaN</td>\n",
1225+
" </tr>\n",
1226+
" <tr>\n",
1227+
" <th>7</th>\n",
1228+
" <td>2018-12-01</td>\n",
1229+
" <td>https://github.com/anmonteiro/aws-lambda-ocaml...</td>\n",
1230+
" <td>anmonteiro/aws-lambda-ocaml-runtime</td>\n",
1231+
" <td>4</td>\n",
1232+
" <td>18578964</td>\n",
1233+
" <td>5.0</td>\n",
1234+
" <td>NaN</td>\n",
1235+
" </tr>\n",
1236+
" <tr>\n",
1237+
" <th>8</th>\n",
1238+
" <td>2018-12-01</td>\n",
1239+
" <td>https://github.com/KumarAbhirup/bulk-mail-cli</td>\n",
1240+
" <td>None</td>\n",
1241+
" <td>4</td>\n",
1242+
" <td>18577887</td>\n",
1243+
" <td>NaN</td>\n",
1244+
" <td>NaN</td>\n",
1245+
" </tr>\n",
1246+
" <tr>\n",
1247+
" <th>9</th>\n",
1248+
" <td>2018-12-01</td>\n",
1249+
" <td>https://github.com/jerverless/jerverless</td>\n",
1250+
" <td>None</td>\n",
1251+
" <td>4</td>\n",
1252+
" <td>18577036</td>\n",
1253+
" <td>NaN</td>\n",
1254+
" <td>NaN</td>\n",
1255+
" </tr>\n",
1256+
" </tbody>\n",
1257+
"</table>\n",
1258+
"</div>"
1259+
],
1260+
"text/plain": [
1261+
" date github_url \\\n",
1262+
"0 2018-12-01 https://github.com/ithinco/i-am-chinese-the-dr... \n",
1263+
"1 2018-12-01 https://github.com/YugaByte/yugabyte-db \n",
1264+
"2 2018-12-01 https://github.com/alertlogic/erllambda \n",
1265+
"3 2018-12-01 https://github.com/oxplot/pdftilecut \n",
1266+
"4 2018-12-01 https://github.com/chocolatey/boxstarter \n",
1267+
"5 2018-12-01 https://github.com/devsnek/engine262 \n",
1268+
"6 2018-12-01 https://github.com/andrewchaa/functional.pipe \n",
1269+
"7 2018-12-01 https://github.com/anmonteiro/aws-lambda-ocaml... \n",
1270+
"8 2018-12-01 https://github.com/KumarAbhirup/bulk-mail-cli \n",
1271+
"9 2018-12-01 https://github.com/jerverless/jerverless \n",
1272+
"\n",
1273+
" github_repo hn_score hn_story_id \\\n",
1274+
"0 ithinco/i-am-chinese-the-dragonfly-must-go-on 129 18574181 \n",
1275+
"1 YugaByte/yugabyte-db 115 18576170 \n",
1276+
"2 alertlogic/erllambda 64 18574683 \n",
1277+
"3 oxplot/pdftilecut 64 18575094 \n",
1278+
"4 chocolatey/boxstarter 9 18575802 \n",
1279+
"5 devsnek/engine262 8 18577658 \n",
1280+
"6 andrewchaa/functional.pipe 4 18574107 \n",
1281+
"7 anmonteiro/aws-lambda-ocaml-runtime 4 18578964 \n",
1282+
"8 None 4 18577887 \n",
1283+
"9 None 4 18577036 \n",
1284+
"\n",
1285+
" stars forks \n",
1286+
"0 60.0 1.0 \n",
1287+
"1 2.0 NaN \n",
1288+
"2 48.0 NaN \n",
1289+
"3 91.0 NaN \n",
1290+
"4 1.0 NaN \n",
1291+
"5 1.0 NaN \n",
1292+
"6 2.0 NaN \n",
1293+
"7 5.0 NaN \n",
1294+
"8 NaN NaN \n",
1295+
"9 NaN NaN "
1296+
]
1297+
},
1298+
"execution_count": 20,
1299+
"metadata": {},
1300+
"output_type": "execute_result"
1301+
}
1302+
],
1303+
"source": [
1304+
"query = \"\"\"\n",
1305+
"WITH github_activity AS (\n",
1306+
"SELECT \n",
1307+
" repo.name as repo,\n",
1308+
" CONCAT('https://github.com/', repo.name) as url,\n",
1309+
" SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n",
1310+
" SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n",
1311+
" COUNT(*) AS cnt\n",
1312+
"FROM `githubarchive.day.{0}`\n",
1313+
"WHERE type IN ('WatchEvent','ForkEvent')\n",
1314+
"GROUP BY 1,2\n",
1315+
"),\n",
1316+
"hacker_news AS (\n",
1317+
"SELECT\n",
1318+
" EXTRACT(DATE FROM timestamp) as date,\n",
1319+
" `by` AS submitter,\n",
1320+
" id as story_id,\n",
1321+
" REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n",
1322+
" SUM(score) as score\n",
1323+
"FROM\n",
1324+
" `bigquery-public-data.hacker_news.full`\n",
1325+
"WHERE\n",
1326+
" type = 'story'\n",
1327+
" AND EXTRACT(DATE FROM timestamp)='{1}' \n",
1328+
" AND url LIKE '%https://github.com%'\n",
1329+
" AND url NOT LIKE '%github.com/blog/%'\n",
1330+
"GROUP BY 1,2,3,4\n",
1331+
")\n",
1332+
"\n",
1333+
"SELECT\n",
1334+
" a.date as date,\n",
1335+
" a.url as github_url,\n",
1336+
" b.repo as github_repo,\n",
1337+
" a.score as hn_score,\n",
1338+
" a.story_id as hn_story_id,\n",
1339+
" b.stars as stars,\n",
1340+
" b.forks as forks\n",
1341+
"FROM hacker_news as a\n",
1342+
"LEFT JOIN github_activity as b\n",
1343+
"ON a.url=b.url\n",
1344+
"ORDER BY hn_score DESC\n",
1345+
"LIMIT 10\n",
1346+
"\"\"\".format(process_date_nodash, process_date)\n",
1347+
"\n",
1348+
"print (query)\n",
1349+
"\n",
1350+
"df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n",
1351+
"df.head(10)"
1352+
]
1353+
},
10641354
{
10651355
"cell_type": "markdown",
10661356
"metadata": {},

0 commit comments

Comments
 (0)