# race_utils.py

# coding=utf-8
# Copyright 2018 The Google AI Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for RACE dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import json
import os
from albert import classifier_utils
from albert import fine_tuning_utils
from albert import modeling
from albert import optimization
from albert import tokenization
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1 import estimator as tf_estimator
from tensorflow.contrib import tpu as contrib_tpu


class InputExample(object):
  """A single training/test example for the RACE dataset.

  Attributes:
    example_id: identifier of the example, stored as given.
    context_sentence: the passage text the question refers to.
    start_ending: optional text prepended to every ending when the example
      is tokenized; may be None.
    endings: sequence of candidate answer texts, one per choice.
    label: optional index of the correct ending; None when unknown.
  """

  def __init__(self,
               example_id,
               context_sentence,
               start_ending,
               endings,
               label=None):
    self.example_id = example_id
    self.context_sentence = context_sentence
    self.start_ending = start_ending
    self.endings = endings
    self.label = label

  def __str__(self):
    return self.__repr__()

  def __repr__(self):
    """Returns a comma-separated, human-readable dump of all fields."""
    parts = [
        "id: {}".format(self.example_id),
        "context_sentence: {}".format(self.context_sentence),
        "start_ending: {}".format(self.start_ending),
    ]
    # Iterate instead of indexing 0..3 so that debugging output never raises
    # for examples with fewer (or more) than four endings, or with None.
    parts.extend(
        "ending_{}: {}".format(idx, ending)
        for idx, ending in enumerate(self.endings or []))

    if self.label is not None:
      parts.append("label: {}".format(self.label))

    return ", ".join(parts)


class RaceProcessor(object):
  """Reads RACE multiple-choice examples from JSON-lines files.

  A split directory is read from `<split>/middle/all.txt` and
  `<split>/high/all.txt`, where every line is one JSON object with the keys
  "id", "article", "questions", "options" and "answers".
  """

  def __init__(self, use_spm, do_lower_case, high_only, middle_only):
    super(RaceProcessor, self).__init__()
    # Text preprocessing switches (see `process_text`).
    self.use_spm = use_spm
    self.do_lower_case = do_lower_case
    # Difficulty-level filters (see `read_examples`).
    self.high_only = high_only
    self.middle_only = middle_only

  def get_train_examples(self, data_dir):
    """Returns the `InputExample`s found under `<data_dir>/RACE/train`."""
    split_dir = os.path.join(data_dir, "RACE", "train")
    return self.read_examples(split_dir)

  def get_dev_examples(self, data_dir):
    """Returns the `InputExample`s found under `<data_dir>/RACE/dev`."""
    split_dir = os.path.join(data_dir, "RACE", "dev")
    return self.read_examples(split_dir)

  def get_test_examples(self, data_dir):
    """Returns the `InputExample`s found under `<data_dir>/RACE/test`."""
    split_dir = os.path.join(data_dir, "RACE", "test")
    return self.read_examples(split_dir)

  def get_labels(self):
    """Returns the answer letters; a letter's position is its label id."""
    return list("ABCD")

  def process_text(self, text):
    """Normalizes raw text according to the configured tokenizer type."""
    if not self.use_spm:
      return tokenization.convert_to_unicode(text)
    return tokenization.preprocess_text(text, lower=self.do_lower_case)

  def read_examples(self, data_dir):
    """Reads every question of a split and returns a list of `InputExample`s.

    Args:
      data_dir: split directory containing the `middle` / `high` folders.

    Returns:
      One `InputExample` per question. Its four endings are the question
      combined with each option: the option replaces every "_" in the
      question when the question contains one, otherwise it is appended
      after a space.
    """
    # "middle" is always read before "high"; `high_only` drops the former
    # and `middle_only` drops the latter.
    levels = []
    if not self.high_only:
      levels.append("middle")
    if not self.middle_only:
      levels.append("high")

    examples = []
    for level in levels:
      data_path = os.path.join(data_dir, level, "all.txt")
      with tf.gfile.Open(data_path) as reader:
        for raw_line in reader:
          record = json.loads(raw_line.strip())

          answers = record["answers"]
          options = record["options"]
          questions = record["questions"]
          article = self.process_text(record["article"])

          for q_idx, answer in enumerate(answers):
            # Answers are letters; "A" maps to label 0.
            label = ord(answer) - ord("A")
            question = self.process_text(questions[q_idx])
            is_cloze = "_" in question

            candidates = []
            for opt_idx in range(4):
              option = self.process_text(options[q_idx][opt_idx])
              if is_cloze:
                candidates.append(question.replace("_", option))
              else:
                candidates.append(" ".join([question, option]))

            examples.append(
                InputExample(
                    example_id=record["id"],
                    context_sentence=article,
                    start_ending=None,
                    endings=candidates,
                    label=label))

    return examples


def convert_single_example(example_index, example, label_size, max_seq_length,
                           tokenizer, max_qa_length):
  """Converts one multiple-choice example into a single `InputFeatures`.

  RACE is a multiple choice task. Following the formatting proposed in
  "Improving Language Understanding by Generative Pre-Training" and suggested
  in https://github.com/google-research/bert/issues/38, every candidate
  ending becomes its own sequence:

    [CLS] context [SEP] ending [SEP]

  so one example yields one row of ids / mask / segment ids per ending. The
  model scores each row with a single value and a softmax over the rows
  gives the final decision.

  Args:
    example_index: position of the example; the first five are logged.
    example: an example exposing `example_id`, `context_sentence`,
      `start_ending`, `endings` and `label`, or a
      `classifier_utils.PaddingInputExample`.
    label_size: number of choices; only used to shape the all-zero features
      returned for padding examples.
    max_seq_length: length every row is zero-padded to.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
    max_qa_length: number of trailing ending tokens that are kept.

  Returns:
    A `classifier_utils.InputFeatures` whose `input_ids`, `input_mask` and
    `segment_ids` are lists with one row per ending.
  """
  if isinstance(example, classifier_utils.PaddingInputExample):
    # Padding examples carry no content: all-zero rows, flagged as not real.
    return classifier_utils.InputFeatures(
        example_id=0,
        input_ids=[[0] * max_seq_length] * label_size,
        input_mask=[[0] * max_seq_length] * label_size,
        segment_ids=[[0] * max_seq_length] * label_size,
        label_id=0,
        is_real_example=False)

  context_tokens = tokenizer.tokenize(example.context_sentence)
  has_prefix = example.start_ending is not None
  if has_prefix:
    prefix_tokens = tokenizer.tokenize(example.start_ending)

  all_input_tokens = []
  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  for ending in example.endings:
    if has_prefix:
      qa_tokens = prefix_tokens + tokenizer.tokenize(ending)
    else:
      qa_tokens = tokenizer.tokenize(ending)
    # Keep only the last `max_qa_length` tokens of the ending.
    qa_tokens = qa_tokens[-max_qa_length:]

    # Room left for the context once the ending and the three special
    # tokens ([CLS], [SEP], [SEP]) are accounted for. Each choice works on
    # its own (possibly shortened) copy of the context.
    context_budget = max_seq_length - 3 - len(qa_tokens)
    if len(context_tokens) > context_budget:
      choice_context = context_tokens[:context_budget]
    else:
      choice_context = context_tokens[:]

    tokens = ["[CLS]"] + choice_context + ["[SEP]"] + qa_tokens + ["[SEP]"]
    # Segment 0 covers [CLS] + context + first [SEP]; segment 1 covers the
    # ending and the final [SEP].
    segment_ids = [0] * (len(choice_context) + 2)
    segment_ids = segment_ids + [1] * (len(qa_tokens) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad all three rows up to the sequence length.
    zeros = [0] * (max_seq_length - len(input_ids))
    input_ids += zeros
    input_mask += zeros
    segment_ids += zeros

    for row in (input_ids, input_mask, segment_ids):
      assert len(row) == max_seq_length

    all_input_tokens.append(tokens)
    all_input_ids.append(input_ids)
    all_input_mask.append(input_mask)
    all_segment_ids.append(segment_ids)

  label = example.label
  if example_index < 5:
    tf.logging.info("*** Example ***")
    tf.logging.info("id: {}".format(example.example_id))
    for choice_idx in range(len(all_input_tokens)):
      tf.logging.info("choice: {}".format(choice_idx))
      tf.logging.info(
          "tokens: {}".format(" ".join(all_input_tokens[choice_idx])))
      tf.logging.info(
          "input_ids: {}".format(
              " ".join(map(str, all_input_ids[choice_idx]))))
      tf.logging.info(
          "input_mask: {}".format(
              " ".join(map(str, all_input_mask[choice_idx]))))
      tf.logging.info(
          "segment_ids: {}".format(
              " ".join(map(str, all_segment_ids[choice_idx]))))
      tf.logging.info("label: {}".format(label))

  # `is_real_example` is not passed here, so it keeps the `InputFeatures`
  # default.
  return classifier_utils.InputFeatures(
      example_id=example.example_id,
      input_ids=all_input_ids,
      input_mask=all_input_mask,
      segment_ids=all_segment_ids,
      label_id=label
  )


def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer,
    output_file, max_qa_length):
  """Converts a set of `InputExample`s to a TFRecord file.

  Every example is turned into features with `convert_single_example` and
  serialized as one `tf.train.Example` holding the int64 features
  "input_ids", "input_mask", "segment_ids" (the per-choice rows concatenated
  into one flat list), "label_ids" and "is_real_example".

  Args:
    examples: sequence of examples to convert.
    label_list: list of labels; only its length (the number of choices) is
      used.
    max_seq_length: padded length of each per-choice row.
    tokenizer: tokenizer forwarded to `convert_single_example`.
    output_file: path of the TFRecord file to write.
    max_qa_length: maximum number of ending tokens, forwarded to
      `convert_single_example`.
  """

  # Defined once instead of on every loop iteration.
  def create_int_feature(values):
    """Wraps an iterable of ints in an int64 `tf.train.Feature`."""
    return tf.train.Feature(
        int64_list=tf.train.Int64List(value=list(values)))

  def flatten(rows):
    """Concatenates the per-choice rows into one flat list."""
    return [value for row in rows for value in row]

  # The context manager closes the writer even when conversion of an
  # example raises, so the file handle is never leaked.
  with tf.python_io.TFRecordWriter(output_file) as writer:
    for (ex_index, example) in enumerate(examples):
      if ex_index % 10000 == 0:
        tf.logging.info(
            "Writing example %d of %d", ex_index, len(examples))

      feature = convert_single_example(ex_index, example, len(label_list),
                                       max_seq_length, tokenizer,
                                       max_qa_length)

      features = collections.OrderedDict()
      features["input_ids"] = create_int_feature(flatten(feature.input_ids))
      features["input_mask"] = create_int_feature(
          flatten(feature.input_mask))
      features["segment_ids"] = create_int_feature(
          flatten(feature.segment_ids))
      features["label_ids"] = create_int_feature([feature.label_id])
      features["is_real_example"] = create_int_feature(
          [int(feature.is_real_example)])

      tf_example = tf.train.Example(
          features=tf.train.Features(feature=features))
      writer.write(tf_example.SerializeToString())


def create_model(albert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, max_seq_length,
                 dropout_prob, hub_module):
  """Builds the multiple-choice scoring head on top of ALBERT.

  The inputs are reshaped to `[batch * num_labels, max_seq_length]` so that
  every choice is encoded as its own sequence. Each encoded sequence is
  projected to a single score, and the scores are reshaped back to
  `[batch, num_labels]` before the softmax.

  Args:
    albert_config: model configuration forwarded to
      `fine_tuning_utils.create_albert`.
    is_training: whether dropout is applied to the encoder output.
    input_ids: token ids; reshaped to `[batch * num_labels, max_seq_length]`.
    input_mask: attention mask, reshaped like `input_ids`.
    segment_ids: segment ids, reshaped like `input_ids`.
    labels: index of the correct choice for each example.
    num_labels: number of choices per example.
    use_one_hot_embeddings: forwarded to `fine_tuning_utils.create_albert`.
    max_seq_length: length of every per-choice sequence.
    dropout_prob: dropout probability used during training.
    hub_module: forwarded to `fine_tuning_utils.create_albert`.

  Returns:
    A tuple `(loss, per_example_loss, probabilities, logits, predictions)`.
  """
  bsz_per_core = tf.shape(input_ids)[0]

  # Fold the choice dimension into the batch dimension.
  flat_shape = [bsz_per_core * num_labels, max_seq_length]
  flat_input_ids = tf.reshape(input_ids, flat_shape)
  flat_input_mask = tf.reshape(input_mask, flat_shape)
  flat_segment_ids = tf.reshape(segment_ids, flat_shape)

  encoder_outputs = fine_tuning_utils.create_albert(
      albert_config=albert_config,
      is_training=is_training,
      input_ids=flat_input_ids,
      input_mask=flat_input_mask,
      segment_ids=flat_segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      use_einsum=True,
      hub_module=hub_module)
  (output_layer, _) = encoder_outputs

  hidden_size = output_layer.shape[-1].value

  # A single output unit: every (example, choice) row gets one score.
  output_weights = tf.get_variable(
      "output_weights", [1, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [1],
      initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # Dropout is applied in training only.
      output_layer = tf.nn.dropout(
          output_layer, keep_prob=1 - dropout_prob)

    choice_scores = tf.matmul(output_layer, output_weights, transpose_b=True)
    choice_scores = tf.nn.bias_add(choice_scores, output_bias)
    # Unfold back to one row of `num_labels` scores per example.
    logits = tf.reshape(choice_scores, [bsz_per_core, num_labels])
    probabilities = tf.nn.softmax(logits, axis=-1)
    predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(
        labels, depth=tf.cast(num_labels, dtype=tf.int32), dtype=tf.float32)

    # Cross-entropy against the one-hot encoding of the correct choice.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, probabilities, logits, predictions)


def model_fn_builder(albert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings, max_seq_length, dropout_prob,
                     hub_module):
  """Builds the `model_fn` closure handed to TPUEstimator.

  The returned function creates the multiple-choice model, optionally
  initializes it from `init_checkpoint`, and returns the
  `TPUEstimatorSpec` matching the requested mode (train, eval, or predict).
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s", feature_name,
                      features[feature_name].shape)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    # Examples without an explicit flag all count as real.
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    model_outputs = create_model(
        albert_config=albert_config,
        is_training=(mode == tf_estimator.ModeKeys.TRAIN),
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        labels=label_ids,
        num_labels=num_labels,
        use_one_hot_embeddings=use_one_hot_embeddings,
        max_seq_length=max_seq_length,
        dropout_prob=dropout_prob,
        hub_module=hub_module)
    (total_loss, per_example_loss, probabilities, logits,
     predictions) = model_outputs

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if not use_tpu:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
      else:
        # On TPU the checkpoint restore is deferred to the scaffold function.
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      else:
        init_string = ""
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    if mode == tf_estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
      return contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)

    if mode == tf_estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        """Accuracy and mean loss, weighted by `is_real_example`."""
        predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
        return {
            "eval_accuracy": tf.metrics.accuracy(
                labels=label_ids, predictions=predicted,
                weights=is_real_example),
            "eval_loss": tf.metrics.mean(
                values=per_example_loss, weights=is_real_example),
        }

      metric_inputs = [per_example_loss, label_ids, logits, is_real_example]
      return contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(metric_fn, metric_inputs),
          scaffold_fn=scaffold_fn)

    # Any other mode is treated as prediction.
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities,
                     "predictions": predictions},
        scaffold_fn=scaffold_fn)

  return model_fn