Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
leoalenc committed Sep 11, 2024
1 parent e7f80f0 commit 1a1bb44
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/CreateModels.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Train one UDPipe model per cross-validation fold.
# Expects train-1.conllu .. train-10.conllu in the current directory and
# produces model-1.output .. model-10.output.

# Iterate over the ten training files
for i in {1..10}; do
    train_file="train-${i}.conllu"
    model="model-${i}.output"

    # Quote expansions so the command is safe even if names ever contain
    # spaces or glob characters.
    udpipe --train "${model}" "${train_file}"

done
83 changes: 83 additions & 0 deletions src/ParseGoldTokResults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Leonel Figueiredo de Alencar
# Last update: August 25, 2024

import re
import numpy as np
import glob
import os

# Directory where your result files are located
MYDIR="/home/leonel/Dropbox/nheengatu/pibic/test01/"
# NOTE(review): MYDIR is defined above but never used below — the glob in
# main() is built from RESULTS_DIR, i.e. the current working directory.
# Confirm whether RESULTS_DIR was meant to be set to MYDIR.
RESULTS_DIR = ''

def extract_las_values(data):
    """
    Extracts LAS values from the provided data string.
    Args:
        data (str): Multi-line string containing LAS results.
    Returns:
        list: A list of LAS values as floats, in order of appearance.
    """
    # Lines such as "LAS: 85.32%" -> capture the number before the '%'.
    las_pattern = re.compile(r'LAS:\s([\d\.]+)%')
    return [float(score) for score in las_pattern.findall(data)]

def calculate_statistics(values):
    """
    Calculates mean and sample standard deviation of a list of numbers.
    Args:
        values (list): Non-empty list of numerical values.
    Returns:
        tuple: Mean and standard deviation rounded to two decimals, as
            plain Python floats. The standard deviation is 0.0 when only
            one value is given, since the sample standard deviation
            (ddof=1) is undefined (0/0 -> NaN) for a single observation.
    Raises:
        ValueError: If ``values`` is empty (np.mean would otherwise
            warn and return NaN).
    """
    if not values:
        raise ValueError("cannot compute statistics of an empty sequence")

    mean = float(np.mean(values))
    # ddof=1: sample standard deviation; guarded against the single-value case.
    std_dev = float(np.std(values, ddof=1)) if len(values) > 1 else 0.0

    return round(mean, 2), round(std_dev, 2)

def main():
    """
    Aggregates LAS scores from all gold-tokenization result files.

    Reads every file matching ``gold-tok-tags-*.txt`` under RESULTS_DIR,
    extracts the LAS percentages, and prints their mean and sample
    standard deviation.
    """
    # Pattern to match all relevant files
    pattern = os.path.join(RESULTS_DIR, "gold-tok-tags-*.txt")

    # Find all files matching the pattern
    files = glob.glob(pattern)

    if not files:
        print("No files found matching the pattern.")
        return

    las_values = []

    for file_path in files:
        # Only the file I/O can reasonably fail here; catch OSError
        # narrowly instead of a blanket except that would also hide
        # programming errors.
        try:
            with open(file_path, 'r') as file:
                data = file.read()
        except OSError as e:
            print(f"An error occurred: {e}")
            return
        las_values.extend(extract_las_values(data))

    if not las_values:
        print("No LAS values found in the data.")
        return

    # Calculate statistics
    mean, std_dev = calculate_statistics(las_values)

    # Print results
    print(f"LAS Values: {las_values}")
    print(f"Mean LAS: {mean}%")
    print(f"Standard Deviation of LAS: {std_dev}%")

if __name__ == '__main__':
    main()
16 changes: 16 additions & 0 deletions src/RunTenFoldCrossValidation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Author: Leonel Figueiredo de Alencar
# Last update: September 10, 2024
#
# Runs the complete ten-fold cross-validation pipeline.

# Abort the pipeline as soon as any step fails, instead of running the
# later (hours-long) steps on missing or incomplete data.
set -e

# Split treebank file in ten folds
TestSuite.py sample.conllu

# Create 10 different models (this may take up several hours)
CreateModels.sh

# Parse and test 10 times using each time a different model and test file
TenFoldCrossVal.sh

# Process the results of parsing with gold tokenization and gold tags
ParseGoldTokResults.py

13 changes: 13 additions & 0 deletions src/TenFoldCrossVal.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Evaluate each fold's model on its held-out test file, writing one
# accuracy report per fold (gold-tok-tags-<i>.txt).

# Iterate over the ten model/test-file pairs
for i in {1..10}; do
    test_file="test-${i}.conllu"
    model="model-${i}.output"
    results="results-${i}.txt"   # used only by the raw-text variant below
    gold="gold-tok-tags-${i}.txt"

    # Variant that also exercises UDPipe's own tokenizer and tagger
    # (kept for reference):
    #udpipe --tokenize --tokenizer=ranges --accuracy --tag --parse ${model} ${test_file} > ${results}

    # Evaluate parsing with gold tokenization and gold tags; quote the
    # expansions so unexpected characters in names cannot break the call.
    udpipe --accuracy --parse "${model}" "${test_file}" > "${gold}"
done
49 changes: 49 additions & 0 deletions src/TestSuite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Leonel Figueiredo de Alencar
# Last update: August 24, 2024

import sys
import numpy as np
from AnnotateConllu import extractConlluSents, writeSentsConllu
from conllu import TokenList
from sklearn.model_selection import KFold


def divide_treebank(treebank_data, parts=10):
    """
    Splits a list of treebank sentences into contiguous chunks.

    Fixes a silent data loss in the original equal-size slicing: when
    len(treebank_data) is not divisible by ``parts``, the remainder
    sentences were dropped. Here the remainder is spread one extra
    sentence at a time over the first chunks, so the concatenation of
    the chunks always equals the input.

    Args:
        treebank_data: Sequence of treebank sentences.
        parts (int): Number of chunks to produce (default 10, keeping
            the original call signature backward-compatible).
    Returns:
        list: ``parts`` sub-sequences covering all of ``treebank_data``.
    """
    base_size, remainder = divmod(len(treebank_data), parts)

    treebank_parts = []
    start = 0
    for k in range(parts):
        # The first `remainder` chunks get one extra sentence each.
        end = start + base_size + (1 if k < remainder else 0)
        treebank_parts.append(treebank_data[start:end])
        start = end

    return treebank_parts


def mkTestTrain(dataset):
    """
    Writes ten train/test fold pairs for cross-validation.

    Performs a shuffled 10-fold split (fixed random_state=42 for
    reproducibility); fold i is written to test-i.conllu and
    train-i.conllu in the current directory.
    """
    sentences = np.array(dataset, dtype=object)
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(sentences), start=1):
        # Write the held-out fold first, then its complement, mirroring
        # the file-pair naming used by the shell scripts.
        writeSentsConllu(sentences[test_idx], f"test-{fold}.conllu")
        writeSentsConllu(sentences[train_idx], f"train-{fold}.conllu")


def main():
    """
    Entry point: splits the treebank given on the command line into
    ten train/test fold pairs.
    """
    # Check if a filename is provided as a command line argument
    if len(sys.argv) != 2:
        # Report the script as actually invoked instead of the
        # hard-coded "python script.py", and send usage to stderr.
        print(f"Usage: {sys.argv[0]} <treebank_filename>", file=sys.stderr)
        sys.exit(1)

    # Read treebank data from the file
    treebank_filename = sys.argv[1]
    treebank_data = extractConlluSents(treebank_filename)

    # Write test and train files
    mkTestTrain(treebank_data)

if __name__ == "__main__":
    main()

0 comments on commit 1a1bb44

Please sign in to comment.