Iterative training using distributed cache

Note: This feature is supported in Hivemall v0.2 (Hive v0.11) and later.

1st iteration

-- Run every following statement inside the a9a database.
USE a9a;

-- Value handed to logress through its "-total_steps" option in the training queries.
-- NOTE(review): presumably the number of rows in a9atrain (confirm).
SET hivevar:total_steps=32561;

-- Compress the query output. The model files are shipped to every worker,
-- so a compressed table is preferable.
SET hive.exec.compress.output=true;
-- Optional compression type and codec choices (left disabled):
-- SET mapred.output.compression.type=BLOCK;
-- SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;

-- First pass: run logress over a9atrain and store the resulting model.
-- logress emits (feature, weight) rows, and the weights emitted for the same
-- feature are averaged so the table holds one row per feature.
-- NOTE(review): the table name says "snappy", but the stored codec is whatever
-- the session/cluster compression settings select (verify before relying on it).
CREATE TABLE a9a_model1_snappy
AS
SELECT
  CAST(feature AS INT) AS feature,
  CAST(AVG(weight) AS FLOAT) AS weight
FROM (
  SELECT
    logress(addBias(features), label, "-total_steps ${total_steps}") AS (feature, weight)
  FROM
    a9atrain
) t
GROUP BY feature;

-- Switch output compression back off for the statements that follow.
SET hive.exec.compress.output=false;

-- To inspect the table that was just written:
-- desc extended a9a_model1_snappy;

-- Copy the Hive table's HDFS directory into the distributed cache.
-- Note: the table must be stored in Hive's default SequenceFile format.
ADD FILE hdfs://dm01:8020/user/hive/warehouse/a9a.db/a9a_model1_snappy;

Files under a9a_model1_snappy are distributed to the workers before a job is launched.

2nd iteration

-- Second pass: run logress over a9atrain again, this time loading the
-- first-pass model through the "-loadmodel" option.
-- modelfile is the name handed to "-loadmodel".
SET hivevar:modelfile=a9a_model1_snappy;
-- eta0 is handed to the "-eta0" option.
-- NOTE(review): presumably the initial learning rate (confirm against the logress docs).
SET hivevar:eta0=0.095;

-- feature is cast to INT so this table has the same (feature, weight) column
-- types as a9a_model1_snappy. Without the cast the column keeps whatever type
-- logress emits, and later joins on feature may depend on implicit conversion.
CREATE TABLE a9a_model2_distcache
AS
SELECT
  CAST(feature AS INT) AS feature,
  CAST(AVG(weight) AS FLOAT) AS weight
FROM (
  SELECT
    logress(
      addBias(features),
      label,
      "-total_steps ${total_steps} -eta0 ${eta0} -loadmodel ${modelfile}"
    ) AS (feature, weight)
  FROM
    a9atrain
) t
GROUP BY feature;

By setting the "-output_untouched" option, each trainer also outputs the weights that were not touched during training. This option is intended to be used together with the "-loadmodel" option.

Evaluation

-- Per-row prediction for the test set using the second-pass model.
-- For each rowid the inner query sums weight * value over the row's features.
-- The outer query exposes prob = sigmoid(sum) and label = 1.0 when prob >= 0.5,
-- otherwise 0.0.
-- The LEFT OUTER JOIN keeps test features that have no row in the model, and
-- their NULL products are skipped by SUM.
-- Disabled alternative for label that thresholds the raw sum instead of the sigmoid:
--  CAST((case when sum(m.weight * t.value) > 0.0 then 1.0 else 0.0 end) as FLOAT) as label
CREATE OR REPLACE VIEW a9a_predict2
AS
SELECT
  s.rowid,
  sigmoid(s.weighted_sum) AS prob,
  CAST((CASE WHEN sigmoid(s.weighted_sum) >= 0.5 THEN 1.0 ELSE 0.0 END) AS FLOAT) AS label
FROM (
  SELECT
    t.rowid,
    SUM(m.weight * t.value) AS weighted_sum
  FROM
    a9atest_exploded t
    LEFT OUTER JOIN a9a_model2_distcache m ON (t.feature = m.feature)
  GROUP BY
    t.rowid
) s;

-- Pairs each test row's label from a9atest with the predicted label and
-- probability from a9a_predict2, matched on rowid.
CREATE OR REPLACE VIEW a9a_submit2 AS
SELECT
  t.label AS actual,
  pd.label AS predicted,
  pd.prob AS probability
FROM
  a9atest t
  JOIN a9a_predict2 pd ON (t.rowid = pd.rowid);

-- Accuracy: the number of rows whose predicted label equals the actual label,
-- divided by the test-set size held in num_test_instances.
SET hivevar:num_test_instances=16281;
SELECT
  COUNT(1) / ${num_test_instances}
FROM
  a9a_submit2
WHERE
  actual = predicted;

0.8438056630428107 (accuracy improves with additional iterations)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Iterative training using distributed cache

1st iteration

2nd iteration

Evaluation

Clone this wiki locally