|
1061 | 1061 | "df.head(10)" |
1062 | 1062 | ] |
1063 | 1063 | }, |
| 1064 | + { |
| 1065 | + "cell_type": "markdown", |
| 1066 | + "metadata": {}, |
| 1067 | + "source": [ |
| 1068 | + "## Example final table: GitHub on Hacker News trends for 2018-12-01" |
| 1069 | + ] |
| 1070 | + }, |
| 1071 | + { |
| 1072 | + "cell_type": "code", |
| 1073 | + "execution_count": 20, |
| 1074 | + "metadata": {}, |
| 1075 | + "outputs": [ |
| 1076 | + { |
| 1077 | + "name": "stdout", |
| 1078 | + "output_type": "stream", |
| 1079 | + "text": [ |
| 1080 | + "\n", |
| 1081 | + "WITH github_activity AS (\n", |
| 1082 | + "SELECT \n", |
| 1083 | + " repo.name as repo,\n", |
| 1084 | + " CONCAT('https://github.com/', repo.name) as url,\n", |
| 1085 | + " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", |
| 1086 | + " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", |
| 1087 | + " COUNT(*) AS cnt\n", |
| 1088 | + "FROM `githubarchive.day.20181201`\n", |
| 1089 | + "WHERE type IN ('WatchEvent','ForkEvent')\n", |
| 1090 | + "GROUP BY 1,2\n", |
| 1091 | + "),\n", |
| 1092 | + "hacker_news AS (\n", |
| 1093 | + "SELECT\n", |
| 1094 | + " EXTRACT(DATE FROM timestamp) as date,\n", |
| 1095 | + " `by` AS submitter,\n", |
| 1096 | + " id as story_id,\n", |
| 1097 | + " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", |
| 1098 | + " SUM(score) as score\n", |
| 1099 | + "FROM\n", |
| 1100 | + " `bigquery-public-data.hacker_news.full`\n", |
| 1101 | + "WHERE\n", |
| 1102 | + " type = 'story'\n", |
| 1103 | + " AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n", |
| 1104 | + " AND url LIKE '%https://github.com%'\n", |
| 1105 | + " AND url NOT LIKE '%github.com/blog/%'\n", |
| 1106 | + "GROUP BY 1,2,3,4\n", |
| 1107 | + ")\n", |
| 1108 | + "\n", |
| 1109 | + "SELECT\n", |
| 1110 | + " a.date as date,\n", |
| 1111 | + " a.url as github_url,\n", |
| 1112 | + " b.repo as github_repo,\n", |
| 1113 | + " a.score as hn_score,\n", |
| 1114 | + " a.story_id as hn_story_id,\n", |
| 1115 | + " b.stars as stars,\n", |
| 1116 | + " b.forks as forks\n", |
| 1117 | + "FROM hacker_news as a\n", |
| 1118 | + "LEFT JOIN github_activity as b\n", |
| 1119 | + "ON a.url=b.url\n", |
| 1120 | + "ORDER BY hn_score DESC\n", |
| 1121 | + "LIMIT 10\n", |
| 1122 | + "\n" |
| 1123 | + ] |
| 1124 | + }, |
| 1125 | + { |
| 1126 | + "data": { |
| 1127 | + "text/html": [ |
| 1128 | + "<div>\n", |
| 1129 | + "<style scoped>\n", |
| 1130 | + " .dataframe tbody tr th:only-of-type {\n", |
| 1131 | + " vertical-align: middle;\n", |
| 1132 | + " }\n", |
| 1133 | + "\n", |
| 1134 | + " .dataframe tbody tr th {\n", |
| 1135 | + " vertical-align: top;\n", |
| 1136 | + " }\n", |
| 1137 | + "\n", |
| 1138 | + " .dataframe thead th {\n", |
| 1139 | + " text-align: right;\n", |
| 1140 | + " }\n", |
| 1141 | + "</style>\n", |
| 1142 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 1143 | + " <thead>\n", |
| 1144 | + " <tr style=\"text-align: right;\">\n", |
| 1145 | + " <th></th>\n", |
| 1146 | + " <th>date</th>\n", |
| 1147 | + " <th>github_url</th>\n", |
| 1148 | + " <th>github_repo</th>\n", |
| 1149 | + " <th>hn_score</th>\n", |
| 1150 | + " <th>hn_story_id</th>\n", |
| 1151 | + " <th>stars</th>\n", |
| 1152 | + " <th>forks</th>\n", |
| 1153 | + " </tr>\n", |
| 1154 | + " </thead>\n", |
| 1155 | + " <tbody>\n", |
| 1156 | + " <tr>\n", |
| 1157 | + " <th>0</th>\n", |
| 1158 | + " <td>2018-12-01</td>\n", |
| 1159 | + " <td>https://github.com/ithinco/i-am-chinese-the-dr...</td>\n", |
| 1160 | + " <td>ithinco/i-am-chinese-the-dragonfly-must-go-on</td>\n", |
| 1161 | + " <td>129</td>\n", |
| 1162 | + " <td>18574181</td>\n", |
| 1163 | + " <td>60.0</td>\n", |
| 1164 | + " <td>1.0</td>\n", |
| 1165 | + " </tr>\n", |
| 1166 | + " <tr>\n", |
| 1167 | + " <th>1</th>\n", |
| 1168 | + " <td>2018-12-01</td>\n", |
| 1169 | + " <td>https://github.com/YugaByte/yugabyte-db</td>\n", |
| 1170 | + " <td>YugaByte/yugabyte-db</td>\n", |
| 1171 | + " <td>115</td>\n", |
| 1172 | + " <td>18576170</td>\n", |
| 1173 | + " <td>2.0</td>\n", |
| 1174 | + " <td>NaN</td>\n", |
| 1175 | + " </tr>\n", |
| 1176 | + " <tr>\n", |
| 1177 | + " <th>2</th>\n", |
| 1178 | + " <td>2018-12-01</td>\n", |
| 1179 | + " <td>https://github.com/alertlogic/erllambda</td>\n", |
| 1180 | + " <td>alertlogic/erllambda</td>\n", |
| 1181 | + " <td>64</td>\n", |
| 1182 | + " <td>18574683</td>\n", |
| 1183 | + " <td>48.0</td>\n", |
| 1184 | + " <td>NaN</td>\n", |
| 1185 | + " </tr>\n", |
| 1186 | + " <tr>\n", |
| 1187 | + " <th>3</th>\n", |
| 1188 | + " <td>2018-12-01</td>\n", |
| 1189 | + " <td>https://github.com/oxplot/pdftilecut</td>\n", |
| 1190 | + " <td>oxplot/pdftilecut</td>\n", |
| 1191 | + " <td>64</td>\n", |
| 1192 | + " <td>18575094</td>\n", |
| 1193 | + " <td>91.0</td>\n", |
| 1194 | + " <td>NaN</td>\n", |
| 1195 | + " </tr>\n", |
| 1196 | + " <tr>\n", |
| 1197 | + " <th>4</th>\n", |
| 1198 | + " <td>2018-12-01</td>\n", |
| 1199 | + " <td>https://github.com/chocolatey/boxstarter</td>\n", |
| 1200 | + " <td>chocolatey/boxstarter</td>\n", |
| 1201 | + " <td>9</td>\n", |
| 1202 | + " <td>18575802</td>\n", |
| 1203 | + " <td>1.0</td>\n", |
| 1204 | + " <td>NaN</td>\n", |
| 1205 | + " </tr>\n", |
| 1206 | + " <tr>\n", |
| 1207 | + " <th>5</th>\n", |
| 1208 | + " <td>2018-12-01</td>\n", |
| 1209 | + " <td>https://github.com/devsnek/engine262</td>\n", |
| 1210 | + " <td>devsnek/engine262</td>\n", |
| 1211 | + " <td>8</td>\n", |
| 1212 | + " <td>18577658</td>\n", |
| 1213 | + " <td>1.0</td>\n", |
| 1214 | + " <td>NaN</td>\n", |
| 1215 | + " </tr>\n", |
| 1216 | + " <tr>\n", |
| 1217 | + " <th>6</th>\n", |
| 1218 | + " <td>2018-12-01</td>\n", |
| 1219 | + " <td>https://github.com/andrewchaa/functional.pipe</td>\n", |
| 1220 | + " <td>andrewchaa/functional.pipe</td>\n", |
| 1221 | + " <td>4</td>\n", |
| 1222 | + " <td>18574107</td>\n", |
| 1223 | + " <td>2.0</td>\n", |
| 1224 | + " <td>NaN</td>\n", |
| 1225 | + " </tr>\n", |
| 1226 | + " <tr>\n", |
| 1227 | + " <th>7</th>\n", |
| 1228 | + " <td>2018-12-01</td>\n", |
| 1229 | + " <td>https://github.com/anmonteiro/aws-lambda-ocaml...</td>\n", |
| 1230 | + " <td>anmonteiro/aws-lambda-ocaml-runtime</td>\n", |
| 1231 | + " <td>4</td>\n", |
| 1232 | + " <td>18578964</td>\n", |
| 1233 | + " <td>5.0</td>\n", |
| 1234 | + " <td>NaN</td>\n", |
| 1235 | + " </tr>\n", |
| 1236 | + " <tr>\n", |
| 1237 | + " <th>8</th>\n", |
| 1238 | + " <td>2018-12-01</td>\n", |
| 1239 | + " <td>https://github.com/KumarAbhirup/bulk-mail-cli</td>\n", |
| 1240 | + " <td>None</td>\n", |
| 1241 | + " <td>4</td>\n", |
| 1242 | + " <td>18577887</td>\n", |
| 1243 | + " <td>NaN</td>\n", |
| 1244 | + " <td>NaN</td>\n", |
| 1245 | + " </tr>\n", |
| 1246 | + " <tr>\n", |
| 1247 | + " <th>9</th>\n", |
| 1248 | + " <td>2018-12-01</td>\n", |
| 1249 | + " <td>https://github.com/jerverless/jerverless</td>\n", |
| 1250 | + " <td>None</td>\n", |
| 1251 | + " <td>4</td>\n", |
| 1252 | + " <td>18577036</td>\n", |
| 1253 | + " <td>NaN</td>\n", |
| 1254 | + " <td>NaN</td>\n", |
| 1255 | + " </tr>\n", |
| 1256 | + " </tbody>\n", |
| 1257 | + "</table>\n", |
| 1258 | + "</div>" |
| 1259 | + ], |
| 1260 | + "text/plain": [ |
| 1261 | + " date github_url \\\n", |
| 1262 | + "0 2018-12-01 https://github.com/ithinco/i-am-chinese-the-dr... \n", |
| 1263 | + "1 2018-12-01 https://github.com/YugaByte/yugabyte-db \n", |
| 1264 | + "2 2018-12-01 https://github.com/alertlogic/erllambda \n", |
| 1265 | + "3 2018-12-01 https://github.com/oxplot/pdftilecut \n", |
| 1266 | + "4 2018-12-01 https://github.com/chocolatey/boxstarter \n", |
| 1267 | + "5 2018-12-01 https://github.com/devsnek/engine262 \n", |
| 1268 | + "6 2018-12-01 https://github.com/andrewchaa/functional.pipe \n", |
| 1269 | + "7 2018-12-01 https://github.com/anmonteiro/aws-lambda-ocaml... \n", |
| 1270 | + "8 2018-12-01 https://github.com/KumarAbhirup/bulk-mail-cli \n", |
| 1271 | + "9 2018-12-01 https://github.com/jerverless/jerverless \n", |
| 1272 | + "\n", |
| 1273 | + " github_repo hn_score hn_story_id \\\n", |
| 1274 | + "0 ithinco/i-am-chinese-the-dragonfly-must-go-on 129 18574181 \n", |
| 1275 | + "1 YugaByte/yugabyte-db 115 18576170 \n", |
| 1276 | + "2 alertlogic/erllambda 64 18574683 \n", |
| 1277 | + "3 oxplot/pdftilecut 64 18575094 \n", |
| 1278 | + "4 chocolatey/boxstarter 9 18575802 \n", |
| 1279 | + "5 devsnek/engine262 8 18577658 \n", |
| 1280 | + "6 andrewchaa/functional.pipe 4 18574107 \n", |
| 1281 | + "7 anmonteiro/aws-lambda-ocaml-runtime 4 18578964 \n", |
| 1282 | + "8 None 4 18577887 \n", |
| 1283 | + "9 None 4 18577036 \n", |
| 1284 | + "\n", |
| 1285 | + " stars forks \n", |
| 1286 | + "0 60.0 1.0 \n", |
| 1287 | + "1 2.0 NaN \n", |
| 1288 | + "2 48.0 NaN \n", |
| 1289 | + "3 91.0 NaN \n", |
| 1290 | + "4 1.0 NaN \n", |
| 1291 | + "5 1.0 NaN \n", |
| 1292 | + "6 2.0 NaN \n", |
| 1293 | + "7 5.0 NaN \n", |
| 1294 | + "8 NaN NaN \n", |
| 1295 | + "9 NaN NaN " |
| 1296 | + ] |
| 1297 | + }, |
| 1298 | + "execution_count": 20, |
| 1299 | + "metadata": {}, |
| 1300 | + "output_type": "execute_result" |
| 1301 | + } |
| 1302 | + ], |
| 1303 | + "source": [ |
| | + "# Build the BigQuery SQL text for one day of data.\n", |
| | + "# Placeholder {0} is filled with process_date_nodash (suffix of the githubarchive.day table)\n", |
| | + "# and {1} with process_date (date literal compared against the Hacker News timestamp).\n", |
| | + "#\n", |
| | + "# - github_activity: per repo, counts WatchEvent rows as stars and ForkEvent rows as forks.\n", |
| | + "# - hacker_news: stories whose url contains 'https://github.com' (excluding github.com/blog/),\n", |
| | + "#   with the owner/repo part of the url extracted by regex and the score summed per story.\n", |
| | + "# - final SELECT: LEFT JOIN on url, so stories with no matching GitHub events that day are kept\n", |
| | + "#   with NULL github_repo / stars / forks; top 10 by hn_score.\n", |
| | + "#\n", |
| | + "# NOTE(review): the LIKE filter requires 'https://' while the regex accepts 'https?://', and the\n", |
| | + "# join key in github_activity is always built with 'https://' - confirm http links are meant to be dropped.\n", |
| 1304 | + "query = \"\"\"\n", |
| 1305 | + "WITH github_activity AS (\n", |
| 1306 | + "SELECT \n", |
| 1307 | + " repo.name as repo,\n", |
| 1308 | + " CONCAT('https://github.com/', repo.name) as url,\n", |
| 1309 | + " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", |
| 1310 | + " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", |
| 1311 | + " COUNT(*) AS cnt\n", |
| 1312 | + "FROM `githubarchive.day.{0}`\n", |
| 1313 | + "WHERE type IN ('WatchEvent','ForkEvent')\n", |
| 1314 | + "GROUP BY 1,2\n", |
| 1315 | + "),\n", |
| 1316 | + "hacker_news AS (\n", |
| 1317 | + "SELECT\n", |
| 1318 | + " EXTRACT(DATE FROM timestamp) as date,\n", |
| 1319 | + " `by` AS submitter,\n", |
| 1320 | + " id as story_id,\n", |
| 1321 | + " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", |
| 1322 | + " SUM(score) as score\n", |
| 1323 | + "FROM\n", |
| 1324 | + " `bigquery-public-data.hacker_news.full`\n", |
| 1325 | + "WHERE\n", |
| 1326 | + " type = 'story'\n", |
| 1327 | + " AND EXTRACT(DATE FROM timestamp)='{1}' \n", |
| 1328 | + " AND url LIKE '%https://github.com%'\n", |
| 1329 | + " AND url NOT LIKE '%github.com/blog/%'\n", |
| 1330 | + "GROUP BY 1,2,3,4\n", |
| 1331 | + ")\n", |
| 1332 | + "\n", |
| 1333 | + "SELECT\n", |
| 1334 | + " a.date as date,\n", |
| 1335 | + " a.url as github_url,\n", |
| 1336 | + " b.repo as github_repo,\n", |
| 1337 | + " a.score as hn_score,\n", |
| 1338 | + " a.story_id as hn_story_id,\n", |
| 1339 | + " b.stars as stars,\n", |
| 1340 | + " b.forks as forks\n", |
| 1341 | + "FROM hacker_news as a\n", |
| 1342 | + "LEFT JOIN github_activity as b\n", |
| 1343 | + "ON a.url=b.url\n", |
| 1344 | + "ORDER BY hn_score DESC\n", |
| 1345 | + "LIMIT 10\n", |
| 1346 | + "\"\"\".format(process_date_nodash, process_date)\n", |
| 1347 | + "\n", |
| | + "# Print the rendered SQL (placeholders already substituted).\n", |
| 1348 | + "print (query)\n", |
| 1349 | + "\n", |
| | + "# Run the query with pandas' BigQuery reader using the standard SQL dialect.\n", |
| | + "# project_id, process_date and process_date_nodash are not defined in this cell.\n", |
| | + "# The SQL already ends with LIMIT 10, so head(10) displays the whole result.\n", |
| 1350 | + "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", |
| 1351 | + "df.head(10)" |
| 1352 | + ] |
| 1353 | + }, |
1064 | 1354 | { |
1065 | 1355 | "cell_type": "markdown", |
1066 | 1356 | "metadata": {}, |
|
0 commit comments