This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
Recommendation using Min wise LSH (minhash)
Makoto YUI edited this page Oct 5, 2016
·
29 revisions
This page shows how to list related (similar) articles for each article, using MinHash-based clustering.
-- Session setup: switch to the news20 database, reload the Hivemall jar and
-- run the define-all.hive script (presumably registers the Hivemall functions,
-- to be confirmed against that script).
-- Comments are kept on their own lines: a trailing comment placed after the
-- statement terminator can be glued onto the next statement by Hive CLI
-- versions that split the script on the terminator character.
use news20;

delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;
source /home/myui/tmp/define-all.hive;

-- Generate N sets of minhash values for each row (DEFAULT: 5)
set hivevar:hashes=100;
-- Use K minhash values for generating a resulting value (DEFAULT: 2)
set hivevar:keygroups=2;
-- Assign each training article to minhash clusters.
-- The minhash call is given the option string built from the hashes (-n) and
-- keygroups (-k) variables, and its output columns are aliased (clusterId, rowid).
-- Option-less alternative that relies on the defaults:
--   minhash(rowid, features) as (clusterId, rowid)
CREATE TABLE news20_clusterid_assign
AS
SELECT
    minhash(rowid, features, "-n ${hashes} -k ${keygroups}") AS (clusterId, rowid)
FROM
    news20mc_train
;
-- View listing, for every cluster id, the distinct set of row ids assigned to it.
-- Optional size filter (disabled): define min_cluster_size and enable the
-- HAVING line below to keep only clusters above that size.
--set hivevar:min_cluster_size=5;
CREATE OR REPLACE VIEW news20_cluster
AS
SELECT
    clusterId,
    collect_set(rowid) AS rowids
FROM
    news20_clusterid_assign
GROUP BY
    clusterId
-- having size(rowids) > ${min_cluster_size}
;
-- For each article, collect the set of other articles that share at least
-- 10 minhash cluster ids with it.
CREATE TABLE news20_similar_articles
AS
WITH pair_counts AS (
    -- One row per ordered pair of distinct articles, with the number of
    -- cluster ids the two articles have in common.
    SELECT
        l.rowid,
        r.rowid AS other_id,
        count(1) AS cnt
    FROM
        news20_clusterid_assign l
        -- Inner join on purpose: the rowid inequality filter below rejects any
        -- NULL-extended row, so an outer join would return the same result
        -- while suggesting that unmatched articles are kept.
        JOIN news20_clusterid_assign r
            ON (l.clusterid = r.clusterid)
    WHERE
        l.rowid != r.rowid
    GROUP BY
        l.rowid, r.rowid
    HAVING
        -- 10/${hashes}=10/100=0.1 (keep pairs whose pseudo Jaccard similarity by
        -- Minhash is greater than or equal to 0.1). The constant 10 assumes
        -- hashes=100 and must be adjusted when the hashes variable changes.
        cnt >= 10
)
SELECT
    rowid,
    collect_set(other_id) AS related_articles
FROM
    pair_counts
GROUP BY
    rowid
-- order by rowid asc
;
List all related articles that share at least one cluster, without applying a similarity threshold:
-- For each article, collect every other article that shares at least one
-- minhash cluster id with it (no similarity threshold is applied).
CREATE TABLE news20_similar_articles2
AS
SELECT
    l.rowid,
    collect_set(r.rowid) AS related_articles
FROM
    news20_clusterid_assign l
    -- Inner join on purpose: the rowid inequality filter below rejects any
    -- NULL-extended row, so an outer join would return the same result
    -- while suggesting that articles without a match are kept.
    JOIN news20_clusterid_assign r
        ON (l.clusterid = r.clusterid)
WHERE
    l.rowid != r.rowid
GROUP BY
    l.rowid
-- order by rowid asc
;
-- Pairwise pseudo Jaccard similarity between distinct articles, estimated as
-- (number of shared minhash cluster ids) / (value of the hashes variable).
-- Only pairs with similarity of at least 0.1 are stored, and distance is
-- reported as 1.0 - similarity.
CREATE TABLE news20_jaccard_similarity
AS
WITH pair_similarity AS (
    -- One row per ordered pair of distinct articles sharing a cluster id.
    SELECT
        l.rowid,
        r.rowid AS other_id,
        count(1) / ${hashes} AS similarity
    FROM
        news20_clusterid_assign l
        JOIN news20_clusterid_assign r
            ON (l.clusterid = r.clusterid)
    WHERE
        l.rowid != r.rowid
    GROUP BY
        l.rowid, r.rowid
)
SELECT
    rowid,
    other_id,
    similarity,
    1.0 - similarity AS distance
FROM
    pair_similarity
WHERE
    similarity >= 0.1
;