-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_titles.sql
31 lines (30 loc) · 1.06 KB
/
build_titles.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
copy (
with
crew as (
select
tconst,
{ 'directors': str_split(directors,',') ,
'writers' : str_split(writers,',')
} as crew
from read_csv_auto('title.crew.tsv.gz', delim='\t', quote='',header=True)
),
ratings as (
SELECT tconst, ROW(averageRating, numVotes) as ratings
FROM read_csv_auto('title.ratings.tsv.gz', delim='\t', quote='',header=True)
),
titles as (
select *
REPLACE (
str_split(genres,',') as genres,
case WHEN regexp_matches(startYear,'[0-9]+') THEN CAST(startYear as integer) END as startYear,
case WHEN regexp_matches(endYear,'[0-9]+') THEN CAST(endYear as integer) END as endYear,
case WHEN regexp_matches(runtimeMinutes,'[0-9]+') THEN CAST(runtimeMinutes as integer) END as runtimeMinutes,
),
crew.crew,
ratings.ratings,
FROM read_csv_auto('title.basics.tsv.gz', delim='\t', quote='',header=True, all_varchar=true) as title
LEFT JOIN crew on title.tconst = crew.tconst
LEFT JOIN ratings on title.tconst = ratings.tconst
)
select * from titles
) to 'titles.parquet' (FORMAT 'parquet', CODEC 'ZSTD')