This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
Iterative training using distributed cache
myui edited this page Sep 9, 2014
·
1 revision
Note: this feature is available in Hivemall v0.2 (Hive v0.11) or later.
-- Work inside the a9a database.
USE a9a;

-- Substituted as ${total_steps} into the logress option strings below.
SET hivevar:total_steps=32561;

-- Write the model table compressed: its files are shipped to every worker,
-- so a smaller table is preferable.
SET hive.exec.compress.output=true;
-- Optional compression tuning (left disabled):
-- SET mapred.output.compression.type=BLOCK;
-- SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
-- First-pass model: logress emits (feature, weight) rows for a9atrain;
-- the weights are averaged per feature and stored as a9a_model1_snappy.
CREATE TABLE a9a_model1_snappy AS
SELECT
  CAST(trained.feature AS INT)       AS feature,
  CAST(AVG(trained.weight) AS FLOAT) AS weight
FROM (
  SELECT
    logress(addBias(features), label, "-total_steps ${total_steps}") AS (feature, weight)
  FROM
    a9atrain
) trained
GROUP BY
  trained.feature;
-- Turn output compression back off for the statements that follow.
SET hive.exec.compress.output=false;

-- To inspect the stored table:
-- desc extended a9a_model1_snappy;

-- Register the model table's HDFS directory as a resource so that it is
-- copied to the distributed cache.
-- Note: the table must be stored in Hive's default SequenceFile format.
ADD FILE hdfs://dm01:8020/user/hive/warehouse/a9a.db/a9a_model1_snappy;
Files under a9a_model1_snappy are distributed to the workers before a job is launched.
-- Substituted into the logress option string of the second training pass:
--   ${modelfile} is passed to -loadmodel, ${eta0} is passed to -eta0.
SET hivevar:modelfile=a9a_model1_snappy;
SET hivevar:eta0=0.095;
-- Second-pass model: runs logress on a9atrain again, this time passing
-- -eta0 and -loadmodel ${modelfile}; the emitted weights are averaged per
-- feature and stored as a9a_model2_distcache.
CREATE TABLE a9a_model2_distcache AS
SELECT
  retrained.feature,
  CAST(AVG(retrained.weight) AS FLOAT) AS weight
FROM (
  SELECT
    logress(
      addBias(features),
      label,
      "-total_steps ${total_steps} -eta0 ${eta0} -loadmodel ${modelfile}"
    ) AS (feature, weight)
  FROM
    a9atrain
) retrained
GROUP BY
  retrained.feature;
When the "-output_untouched" option is set, each trainer also outputs the weights that were not touched during training. This option is intended to be used together with the "-loadmodel" option.
-- Per-row prediction: for each test rowid, sum weight * value over its
-- features, apply sigmoid to get a probability, and label the row 1.0 when
-- that probability is at least 0.5 (0.0 otherwise).
-- The LEFT OUTER JOIN keeps test features that have no row in the model table.
CREATE OR REPLACE VIEW a9a_predict2 AS
SELECT
  ex.rowid,
  sigmoid(SUM(mdl.weight * ex.value)) AS prob,
  CAST(
    (CASE
       WHEN sigmoid(SUM(mdl.weight * ex.value)) >= 0.5 THEN 1.0
       ELSE 0.0
     END) AS FLOAT
  ) AS label
  -- Alternative that thresholds the raw score instead of the probability:
  -- CAST((case when sum(mdl.weight * ex.value) > 0.0 then 1.0 else 0.0 end) as FLOAT) as label
FROM
  a9atest_exploded ex
  LEFT OUTER JOIN a9a_model2_distcache mdl
    ON (ex.feature = mdl.feature)
GROUP BY
  ex.rowid;
-- Pairs each test row's label with the predicted label and probability
-- from a9a_predict2, matched on rowid.
CREATE OR REPLACE VIEW a9a_submit2 AS
SELECT
  truth.label AS actual,
  pred.label  AS predicted,
  pred.prob   AS probability
FROM
  a9atest truth
  JOIN a9a_predict2 pred
    ON (truth.rowid = pred.rowid);
-- Denominator for the accuracy computation below.
SET hivevar:num_test_instances=16281;

-- Accuracy: rows whose predicted label equals the actual label, divided by
-- ${num_test_instances}.
SELECT
  COUNT(1) / ${num_test_instances}
FROM
  a9a_submit2
WHERE
  actual = predicted;
0.8438056630428107 (accuracy improves with additional training iterations)