Skip to content
This repository has been archived by the owner on Oct 8, 2019. It is now read-only.

Iterative training using distributed cache

myui edited this page Sep 9, 2014 · 1 revision

Note: this feature is supported in Hivemall v0.2 (Hive v0.11) or later.

1st iteration

use a9a;

-- Value substituted into the trainer's "-total_steps" option below
set hivevar:total_steps=32561;

-- Write the model table compressed, because its files are distributed to all workers
set hive.exec.compress.output=true;
-- Optional explicit compression type/codec (left disabled, as in the original recipe):
-- set mapred.output.compression.type=BLOCK;
-- set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;

-- 1st iteration: train with logress and average the emitted weights per feature
create table a9a_model1_snappy as
select
  cast(trained.feature as int) as feature,
  cast(avg(trained.weight) as float) as weight
from (
  select
    logress(addBias(features), label, "-total_steps ${total_steps}") as (feature, weight)
  from
    a9atrain
) trained
group by
  trained.feature;

-- Turn output compression back off for the statements that follow
set hive.exec.compress.output=false;
-- desc extended a9a_model1_snappy;

-- Copy the files of a Hive table stored on HDFS into the distributed cache.
-- The URI below is specific to the original cluster (namenode dm01:8020 and the
-- default warehouse directory). Adjust it to your environment.
-- Note that the table must be stored in Hive's default file format.
-- NOTE(review): the original note names this format "SequenceFile", but Hive's
-- default is normally TextFile. Confirm which format the model loader expects.
add file hdfs://dm01:8020/user/hive/warehouse/a9a.db/a9a_model1_snappy;

Files under a9a_model1_snappy are distributed to workers before launching a job.

2nd iteration

-- 2nd iteration: train again, loading the 1st-iteration model through "-loadmodel".
-- modelfile names the table directory that was added to the distributed cache.
-- eta0 is the value passed to the trainer's "-eta0" option.
-- ${total_steps} is not set here and must already be defined (it is set for the 1st iteration).
set hivevar:modelfile=a9a_model1_snappy;
set hivevar:eta0=0.095;

-- feature is cast to int exactly as in the 1st-iteration model table, so that both
-- model tables expose the same column types and the evaluation join on feature
-- does not depend on implicit type coercion.
create table a9a_model2_distcache
as
select 
 cast(feature as int) as feature,
 cast(avg(weight) as float) as weight
from 
 (select 
      logress(
          addBias(features),
          label,
          "-total_steps ${total_steps} -eta0 ${eta0} -loadmodel ${modelfile}"
     ) as (feature,weight)
  from 
     a9atrain
 ) t 
group by feature;

By setting the "-output_untouched" option, each trainer outputs weights not touched in training. This option is intended to be used with the "-loadmodel" option.

Evaluation

-- Per-row prediction for the test set using the 2nd-iteration model.
-- The inner query sums weight * value over the features of each rowid. Features without
-- a matching model row yield NULL products through the left outer join.
-- The outer query maps the score through sigmoid and labels it 1.0 when prob >= 0.5.
-- An alternative labelling that skips sigmoid would threshold the raw score instead:
--   CAST((case when score > 0.0 then 1.0 else 0.0 end) as FLOAT) as label
create or replace view a9a_predict2
as
select
  scored.rowid,
  sigmoid(scored.score) as prob,
  CAST((case when sigmoid(scored.score) >= 0.5 then 1.0 else 0.0 end) as FLOAT) as label
from (
  select
    te.rowid,
    sum(md.weight * te.value) as score
  from
    a9atest_exploded te
    LEFT OUTER JOIN a9a_model2_distcache md ON (te.feature = md.feature)
  group by
    te.rowid
) scored;

-- Pair the label of each test row with its predicted label and probability
create or replace view a9a_submit2
as
select
  truth.label as actual,
  pred.label as predicted,
  pred.prob as probability
from
  a9atest truth
  JOIN a9a_predict2 pred ON (truth.rowid = pred.rowid);

-- Number of test instances, used as the denominator of the accuracy
set hivevar:num_test_instances=16281;

-- Accuracy: rows whose predicted label equals the actual label, divided by the test-set size
select
  count(1) / ${num_test_instances}
from
  a9a_submit2
where
  actual = predicted;

0.8438056630428107 (accuracy is improved by iterations)

Clone this wiki locally