Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
leoalenc committed Sep 11, 2024
1 parent e7f80f0 commit 1a1bb44
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/CreateModels.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Train one UDPipe model per cross-validation fold.
# Expects train-1.conllu .. train-10.conllu in the current directory and
# produces model-1.output .. model-10.output.

# Iterate over the ten training files
for i in {1..10}; do
    train_file="train-${i}.conllu"
    model="model-${i}.output"

    # Quote expansions so the command is safe even if names ever contain
    # spaces or glob characters.
    udpipe --train "${model}" "${train_file}"

done
83 changes: 83 additions & 0 deletions src/ParseGoldTokResults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Leonel Figueiredo de Alencar
# Last update: August 25, 2024

import re
import numpy as np
import glob
import os

# Directory where your result files are located
MYDIR="/home/leonel/Dropbox/nheengatu/pibic/test01/"
# NOTE(review): MYDIR is defined above but never used below — the glob in
# main() is built from RESULTS_DIR, i.e. the current working directory.
# Confirm whether RESULTS_DIR was meant to be set to MYDIR.
RESULTS_DIR = ''

def extract_las_values(data):
    """
    Extracts LAS values from the provided data string.
    Args:
        data (str): Multi-line string containing LAS results.
    Returns:
        list: A list of LAS values as floats, in order of appearance.
    """
    # Lines such as "LAS: 85.32%" -> capture the number before the '%'.
    las_pattern = re.compile(r'LAS:\s([\d\.]+)%')
    return [float(score) for score in las_pattern.findall(data)]

def calculate_statistics(values):
    """
    Calculates mean and sample standard deviation of a list of numbers.
    Args:
        values (list): Non-empty list of numerical values.
    Returns:
        tuple: Mean and standard deviation rounded to two decimals, as
            plain Python floats. The standard deviation is 0.0 when only
            one value is given, since the sample standard deviation
            (ddof=1) is undefined (0/0 -> NaN) for a single observation.
    Raises:
        ValueError: If ``values`` is empty (np.mean would otherwise
            warn and return NaN).
    """
    if not values:
        raise ValueError("cannot compute statistics of an empty sequence")

    mean = float(np.mean(values))
    # ddof=1: sample standard deviation; guarded against the single-value case.
    std_dev = float(np.std(values, ddof=1)) if len(values) > 1 else 0.0

    return round(mean, 2), round(std_dev, 2)

def main():
    """
    Aggregates LAS scores from all gold-tokenization result files.

    Reads every file matching ``gold-tok-tags-*.txt`` under RESULTS_DIR,
    extracts the LAS percentages, and prints their mean and sample
    standard deviation.
    """
    # Pattern to match all relevant files
    pattern = os.path.join(RESULTS_DIR, "gold-tok-tags-*.txt")

    # Find all files matching the pattern
    files = glob.glob(pattern)

    if not files:
        print("No files found matching the pattern.")
        return

    las_values = []

    for file_path in files:
        # Only the file I/O can reasonably fail here; catch OSError
        # narrowly instead of a blanket except that would also hide
        # programming errors.
        try:
            with open(file_path, 'r') as file:
                data = file.read()
        except OSError as e:
            print(f"An error occurred: {e}")
            return
        las_values.extend(extract_las_values(data))

    if not las_values:
        print("No LAS values found in the data.")
        return

    # Calculate statistics
    mean, std_dev = calculate_statistics(las_values)

    # Print results
    print(f"LAS Values: {las_values}")
    print(f"Mean LAS: {mean}%")
    print(f"Standard Deviation of LAS: {std_dev}%")

if __name__ == '__main__':
    main()
16 changes: 16 additions & 0 deletions src/RunTenFoldCrossValidation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Author: Leonel Figueiredo de Alencar
# Last update: September 10, 2024
#
# Runs the complete ten-fold cross-validation pipeline.

# Abort the pipeline as soon as any step fails, instead of running the
# later (hours-long) steps on missing or incomplete data.
set -e

# Split treebank file in ten folds
TestSuite.py sample.conllu

# Create 10 different models (this may take up several hours)
CreateModels.sh

# Parse and test 10 times using each time a different model and test file
TenFoldCrossVal.sh

# Process the results of parsing with gold tokenization and gold tags
ParseGoldTokResults.py

13 changes: 13 additions & 0 deletions src/TenFoldCrossVal.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Evaluate each fold's model on its held-out test file, writing one
# accuracy report per fold (gold-tok-tags-<i>.txt).

# Iterate over the ten model/test-file pairs
for i in {1..10}; do
    test_file="test-${i}.conllu"
    model="model-${i}.output"
    results="results-${i}.txt"   # used only by the raw-text variant below
    gold="gold-tok-tags-${i}.txt"

    # Variant that also exercises UDPipe's own tokenizer and tagger
    # (kept for reference):
    #udpipe --tokenize --tokenizer=ranges --accuracy --tag --parse ${model} ${test_file} > ${results}

    # Evaluate parsing with gold tokenization and gold tags; quote the
    # expansions so unexpected characters in names cannot break the call.
    udpipe --accuracy --parse "${model}" "${test_file}" > "${gold}"
done
49 changes: 49 additions & 0 deletions src/TestSuite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Leonel Figueiredo de Alencar
# Last update: August 24, 2024

import sys
import numpy as np
from AnnotateConllu import extractConlluSents, writeSentsConllu
from conllu import TokenList
from sklearn.model_selection import KFold


def divide_treebank(treebank_data, parts=10):
    """
    Splits a list of treebank sentences into contiguous chunks.

    Fixes a silent data loss in the original equal-size slicing: when
    len(treebank_data) is not divisible by ``parts``, the remainder
    sentences were dropped. Here the remainder is spread one extra
    sentence at a time over the first chunks, so the concatenation of
    the chunks always equals the input.

    Args:
        treebank_data: Sequence of treebank sentences.
        parts (int): Number of chunks to produce (default 10, keeping
            the original call signature backward-compatible).
    Returns:
        list: ``parts`` sub-sequences covering all of ``treebank_data``.
    """
    base_size, remainder = divmod(len(treebank_data), parts)

    treebank_parts = []
    start = 0
    for k in range(parts):
        # The first `remainder` chunks get one extra sentence each.
        end = start + base_size + (1 if k < remainder else 0)
        treebank_parts.append(treebank_data[start:end])
        start = end

    return treebank_parts


def mkTestTrain(dataset):
    """
    Writes ten train/test fold pairs for cross-validation.

    Performs a shuffled 10-fold split (fixed random_state=42 for
    reproducibility); fold i is written to test-i.conllu and
    train-i.conllu in the current directory.
    """
    sentences = np.array(dataset, dtype=object)
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(sentences), start=1):
        # Write the held-out fold first, then its complement, mirroring
        # the file-pair naming used by the shell scripts.
        writeSentsConllu(sentences[test_idx], f"test-{fold}.conllu")
        writeSentsConllu(sentences[train_idx], f"train-{fold}.conllu")


def main():
    """
    Entry point: splits the treebank given on the command line into
    ten train/test fold pairs.
    """
    # Check if a filename is provided as a command line argument
    if len(sys.argv) != 2:
        # Report the script as actually invoked instead of the
        # hard-coded "python script.py", and send usage to stderr.
        print(f"Usage: {sys.argv[0]} <treebank_filename>", file=sys.stderr)
        sys.exit(1)

    # Read treebank data from the file
    treebank_filename = sys.argv[1]
    treebank_data = extractConlluSents(treebank_filename)

    # Write test and train files
    mkTestTrain(treebank_data)

if __name__ == "__main__":
    main()

0 comments on commit 1a1bb44

Please sign in to comment.