-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathget_all_top_lines.sql
79 lines (79 loc) · 3.02 KB
/
get_all_top_lines.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
-- WARNING: THIS SCRIPT IS DATA INTENSIVE. IT CAN SIGNIFICANTLY CONTRIBUTE
-- TO YOUR FREE QUOTA USAGE. READ BELOW.
--
-- This script will fetch the entire 1.9TB of github content and group it into
-- table:
--
-- lines | extension | num_lines
--
-- where
-- * lines - actual line of code (appears in the original file with \n or \r at the end of the string
-- * extension - file extension where this line appeared
-- * num_lines - how many times this line was seen in the corpus
--
-- To avoid excessive data cost:
-- - each line is trimmed (spacebar and tabs are removed from the beginning and
-- the end of the line).
-- - The lines longer than 119 characters are ignored.
-- - The lines that appear less than 11 times in the corpus are ignored.
--
-- Even with these restrictions the script will process 1.9TB of data, and final
-- result will be ~12GB.
--
-- Make sure you run this script with `allow large results: true` and Query Priority: batch
SELECT
lines, -- actual line of code. Left/right trimmed, Lines longer than 120 characters are ignored
extension, -- file extension where this line appeared
COUNT(*) AS num_lines -- how many times this line was seen
FROM (
SELECT
-- it's common in programming languages to have whitespaces/tabs at the start/end
-- of the line. So we trim whitespaces. Note: LTRIM(line, ' \t') will trim both
-- ` ` and `\t` symbols
RTRIM(LTRIM(line, ' \t'), ' \t') lines,
extension
FROM (
SELECT
-- We split each line by \n or \r symbols. If line ends with \r\n it will
-- result in an empty string, which will be ignored in the outter select.
SPLIT(SPLIT(content, '\r'), '\n') line,
extension
FROM (
-- This will select all lines from all source codes, and will print their extension too
-- Files without extensions are ignored.
SELECT
c.id,
-- since we are grouping by `id` the content and extension should be the same.
-- Just pick first value:
FIRST(c.content) content,
FIRST(f.extension) extension,
FROM
[bigquery-public-data:github_repos.contents] c
JOIN (
-- Join `contents` table with `files` table to get files extensions.
SELECT
REGEXP_EXTRACT(path, r'.+\.(.*?)$') AS extension,
id
FROM
[bigquery-public-data:github_repos.files]
HAVING
extension IS NOT NULL ) f
ON
f.id = c.id
WHERE
c.binary IS FALSE -- we analyze text only
GROUP BY
1 ) ) )
WHERE
-- not interested in empty line.
lines != '' and
-- don't include long lines.The logic behind this is that if we have lines
-- that are too long, most likely they are not written by humans.
LENGTH(lines) < 120
GROUP EACH BY
lines,
extension
HAVING
-- to reduce size of the final dataset we completely ignore lines, that appear less than 10 time.
-- It shouldn't affect final results, since we have plenty of common lines in 1.9TB of code.
num_lines > 10