# utils/get_vocab.py

# -*- coding:utf8 -*-
# ==============================================================================
# Copyright 2017 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Utility function to generate vocabulary file.
"""


import argparse
import io
import json
import sys

from collections import Counter
from itertools import chain


def get_vocab(files, vocab_file):
    """
    Builds vocabulary file from field 'segmented_paragraphs'
    and 'segmented_question'.

    Every non-blank line of each input file is parsed as one JSON object.
    Tokens are counted over all paragraphs of every entry in 'documents'
    plus the tokens in 'segmented_question'. The vocabulary is written as
    UTF-8 text, one "token<TAB>count" line per token, most frequent first;
    tokens with equal counts keep the order in which the counter yields
    them.

    Args:
        files: A list of file names.
        vocab_file: The file that stores the vocabulary.

    Raises:
        KeyError: If a record lacks 'documents' or 'segmented_question',
            or a document lacks 'segmented_paragraphs'.
        ValueError: If a non-blank line is not valid JSON.
    """
    vocab = Counter()
    for path in files:
        # io.open decodes as UTF-8 explicitly on both Python 2 and 3, so the
        # result does not depend on the locale's default encoding.
        with io.open(path, 'r', encoding='utf8') as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines instead of
                    # failing inside json.loads.
                    continue
                obj = json.loads(line)
                for doc in obj['documents']:
                    # Each paragraph is itself a token list; flatten lazily.
                    vocab.update(
                        chain.from_iterable(doc['segmented_paragraphs']))
                vocab.update(obj['segmented_question'])
    # output
    # Write unicode text and let the file object encode it; formatting
    # pre-encoded bytes would emit "b'...'" reprs on Python 3.
    with io.open(vocab_file, 'w', encoding='utf8') as outf:
        for word, count in vocab.most_common():
            outf.write(u'{}\t{}\n'.format(word, count))


if __name__ == '__main__':
    # Command-line entry point: read the input file list and the output
    # path from the arguments, then build the vocabulary file.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--files',
        nargs='+',
        required=True,
        help='file list to count vocab from.')
    cli.add_argument(
        '--vocab',
        required=True,
        help='file to store counted vocab.')
    options = cli.parse_args()
    input_files = options.files
    output_path = options.vocab
    get_vocab(input_files, output_path)