diff --git a/src/CreateModels.sh b/src/CreateModels.sh
new file mode 100755
index 0000000..caa222a
--- /dev/null
+++ b/src/CreateModels.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Iterate over the ten training folds
+for i in {1..10}; do
+    train_file="train-${i}.conllu"
+    model="model-${i}.output"
+
+    # Train a UDPipe model on this fold's training data
+    udpipe --train "${model}" "${train_file}"
+
+done
diff --git a/src/ParseGoldTokResults.py b/src/ParseGoldTokResults.py
new file mode 100755
index 0000000..8e177b1
--- /dev/null
+++ b/src/ParseGoldTokResults.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Author: Leonel Figueiredo de Alencar
+# Last update: August 25, 2024
+
+import re
+import numpy as np
+import glob
+import os
+
+# Directory where your result files are located
+MYDIR = "/home/leonel/Dropbox/nheengatu/pibic/test01/"
+RESULTS_DIR = ''  # empty string = current directory; set to MYDIR to use the path above
+
+def extract_las_values(data):
+    """
+    Extracts LAS values from the provided data string.
+
+    Args:
+        data (str): Multi-line string containing LAS results.
+
+    Returns:
+        list: A list of LAS values as floats.
+    """
+    # Regular expression to find LAS values
+    las_matches = re.findall(r'LAS:\s([\d\.]+)%', data)
+
+    # Convert matched strings to floats
+    las_values = [float(value) for value in las_matches]
+
+    return las_values
+
+def calculate_statistics(values):
+    """
+    Calculates mean and standard deviation of a list of numbers.
+
+    Args:
+        values (list): List of numerical values.
+
+    Returns:
+        tuple: Mean and standard deviation rounded to two decimals.
+    """
+    mean = np.mean(values)
+    std_dev = np.std(values, ddof=1)  # using sample standard deviation
+
+    return round(mean, 2), round(std_dev, 2)
+
+def main():
+    try:
+        # Pattern to match all relevant files
+        pattern = os.path.join(RESULTS_DIR, "gold-tok-tags-*.txt")
+
+        # Find all files matching the pattern
+        files = glob.glob(pattern)
+
+        if not files:
+            print("No files found matching the pattern.")
+            return
+
+        las_values = []
+
+        for file_path in files:
+            with open(file_path, 'r') as file:
+                data = file.read()
+                las_values.extend(extract_las_values(data))
+
+        if not las_values:
+            print("No LAS values found in the data.")
+            return
+
+        # Calculate statistics
+        mean, std_dev = calculate_statistics(las_values)
+
+        # Print results
+        print(f"LAS Values: {las_values}")
+        print(f"Mean LAS: {mean}%")
+        print(f"Standard Deviation of LAS: {std_dev}%")
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+if __name__ == '__main__':
+    main()
diff --git a/src/RunTenFoldCrossValidation.sh b/src/RunTenFoldCrossValidation.sh
new file mode 100755
index 0000000..1241808
--- /dev/null
+++ b/src/RunTenFoldCrossValidation.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Author: Leonel Figueiredo de Alencar
+# Last update: September 10, 2024
+
+# Split the treebank file into ten folds
+TestSuite.py sample.conllu
+
+# Create 10 different models (this may take several hours)
+CreateModels.sh
+
+# Parse and evaluate 10 times, each time with a different model and test file
+TenFoldCrossVal.sh
+
+# Process the results of parsing with gold tokenization and gold tags
+ParseGoldTokResults.py
+
diff --git a/src/TenFoldCrossVal.sh b/src/TenFoldCrossVal.sh
new file mode 100755
index 0000000..fe5655a
--- /dev/null
+++ b/src/TenFoldCrossVal.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Iterate over the ten test folds
+for i in {1..10}; do
+    test_file="test-${i}.conllu"
+    model="model-${i}.output"
+    results="results-${i}.txt"
+    gold="gold-tok-tags-${i}.txt"
+
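+    # Note: the commented-out invocation below would evaluate the full
+    # pipeline from raw text (tokenization, tagging, and parsing) and
+    # write its report to ${results}; the active invocation evaluates
+    # parsing alone, from gold tokenization and gold tags, writing the
+    # accuracy report to ${gold}.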
+    # Evaluate each model on its corresponding test fold
+    #udpipe --tokenize --tokenizer=ranges --accuracy --tag --parse "${model}" "${test_file}" > "${results}"
+    udpipe --accuracy --parse "${model}" "${test_file}" > "${gold}"
+done
diff --git a/src/TestSuite.py b/src/TestSuite.py
new file mode 100755
index 0000000..8a6a1fe
--- /dev/null
+++ b/src/TestSuite.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Author: Leonel Figueiredo de Alencar
+# Last update: August 24, 2024
+
+import sys
+import numpy as np
+from AnnotateConllu import extractConlluSents, writeSentsConllu
+from conllu import TokenList
+from sklearn.model_selection import KFold
+
+
+def divide_treebank(treebank_data):
+    # Note: currently unused; mkTestTrain below uses scikit-learn's KFold instead
+    # Calculate the size of each part
+    part_size = len(treebank_data) // 10
+
+    # Divide the treebank into 10 parts
+    treebank_parts = [treebank_data[i * part_size:(i + 1) * part_size] for i in range(10)]
+
+    return treebank_parts
+
+
+def mkTestTrain(dataset):
+    dataset = np.array(dataset, dtype=object)
+    kf = KFold(n_splits=10, shuffle=True, random_state=42)
+    i = 1
+    for train_index, test_index in kf.split(dataset):
+        # Split data into train and test sets
+        train_data, test_data = dataset[train_index], dataset[test_index]
+        writeSentsConllu(test_data, f"test-{i}.conllu")
+        writeSentsConllu(train_data, f"train-{i}.conllu")
+        i += 1
+
+
+def main():
+    # Check if a filename is provided as a command line argument
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <treebank_file>")
+        sys.exit(1)
+
+    # Read treebank data from the file
+    treebank_filename = sys.argv[1]
+    treebank_data = extractConlluSents(treebank_filename)
+
+    # Write test and train files
+    mkTestTrain(treebank_data)
+
+if __name__ == "__main__":
+    main()
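+
+# Usage example: ./TestSuite.py sample.conllu
+# This writes test-1.conllu/train-1.conllu ... test-10.conllu/train-10.conllu
+# to the current directory, where CreateModels.sh and TenFoldCrossVal.sh
+# expect them.
+#
+# The helpers imported from AnnotateConllu are not defined in this diff; they
+# are assumed to behave roughly like this sketch based on the conllu library:
+#
+#     def extractConlluSents(filename):
+#         with open(filename, encoding="utf-8") as f:
+#             return conllu.parse(f.read())
+#
+#     def writeSentsConllu(sents, filename):
+#         with open(filename, "w", encoding="utf-8") as f:
+#             for sent in sents:
+#                 f.write(sent.serialize())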