-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/bin/bash
# Train one UDPipe model per cross-validation fold.
# For every i in 1..10 this reads train-${i}.conllu and writes
# model-${i}.output, both relative to the current working directory.

# Iterate over file pairs
for i in {1..10}; do
    train_file="train-${i}.conllu"
    model="model-${i}.output"

    # Fail early with a clear message rather than handing udpipe a
    # nonexistent training file.
    if [[ ! -f "${train_file}" ]]; then
        echo "Training file not found: ${train_file}" >&2
        exit 1
    fi

    # Train the model for this fold. Expansions are quoted so that unusual
    # file names cannot be word-split; abort on failure so that no fold is
    # silently left without a model.
    udpipe --train "${model}" "${train_file}" || {
        echo "udpipe training failed for fold ${i}" >&2
        exit 1
    }
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
# Author: Leonel Figueiredo de Alencar | ||
# Last update: August 25, 2024 | ||
|
||
import re | ||
import numpy as np | ||
import glob | ||
import os | ||
|
||
# Directory where your result files are located. Only RESULTS_DIR is read by main(); an empty string means the current working directory. | ||
MYDIR="/home/leonel/Dropbox/nheengatu/pibic/test01/" | ||
RESULTS_DIR = '' | ||
|
||
def extract_las_values(data):
    """
    Extracts LAS values from the provided data string.

    Every occurrence of ``LAS: <number>%`` is collected, in order of
    appearance. Only well-formed decimal numbers (digits, optionally
    followed by a dot and more digits) are accepted, so malformed tokens
    such as ``...`` or ``1.2.3`` are skipped instead of making ``float()``
    raise ``ValueError``.

    Args:
        data (str): Multi-line string containing LAS results.
    Returns:
        list: A list of LAS values as floats (empty if none are found).
    """
    # Any amount of whitespace may separate the label from the number
    las_matches = re.findall(r'LAS:\s*(\d+(?:\.\d+)?)%', data)

    # Convert matched strings to floats
    return [float(value) for value in las_matches]
|
||
def calculate_statistics(values):
    """
    Calculates mean and sample standard deviation of a list of numbers.

    Args:
        values (list): List of numerical values.
    Returns:
        tuple: Mean and standard deviation as plain floats rounded to two
        decimals. With a single value the sample standard deviation is
        undefined and is returned as NaN.
    Raises:
        ValueError: If ``values`` is empty.
    """
    count = len(values)
    if count == 0:
        # np.mean([]) would only emit a RuntimeWarning and return NaN
        raise ValueError("Cannot compute statistics of an empty sequence.")

    mean = float(np.mean(values))

    if count < 2:
        # ddof=1 divides by (n - 1) == 0; report NaN explicitly instead of
        # letting NumPy warn about degrees of freedom.
        std_dev = float('nan')
    else:
        std_dev = float(np.std(values, ddof=1))  # sample standard deviation

    return round(mean, 2), round(std_dev, 2)
|
||
def main():
    """
    Collect LAS scores from the result files and print summary statistics.

    Reads every file matching ``gold-tok-tags-*.txt`` under RESULTS_DIR,
    extracts all LAS percentages, and prints the values together with
    their mean and sample standard deviation. Prints a message and
    returns early when no file or no LAS value is found, or when a file
    cannot be read or parsed.
    """
    # Pattern to match all relevant files
    pattern = os.path.join(RESULTS_DIR, "gold-tok-tags-*.txt")

    # glob returns files in arbitrary order; sort (lexicographically) so
    # the printed list of values is reproducible between runs.
    files = sorted(glob.glob(pattern))

    if not files:
        print("No files found matching the pattern.")
        return

    las_values = []

    for file_path in files:
        # Only I/O, decoding and number-conversion problems are expected
        # here; anything else is a programming error and should surface
        # with a traceback.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = file.read()
            las_values.extend(extract_las_values(data))
        except (OSError, ValueError) as e:
            print(f"An error occurred: {e}")
            return

    if not las_values:
        print("No LAS values found in the data.")
        return

    # Calculate statistics
    mean, std_dev = calculate_statistics(las_values)

    # Print results
    print(f"LAS Values: {las_values}")
    print(f"Mean LAS: {mean}%")
    print(f"Standard Deviation of LAS: {std_dev}%")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
# Author: Leonel Figueiredo de Alencar
# Last update: September 10, 2024
#
# Runs the complete ten-fold cross-validation pipeline.
# Usage: pass the treebank file as the first argument; it defaults to
# sample.conllu when omitted.

# Stop at the first failing stage: each step consumes the files produced
# by the previous one, so continuing after an error would only yield
# misleading results.
set -euo pipefail

treebank="${1:-sample.conllu}"

# split treebank file in ten folds
TestSuite.py "${treebank}"

# create 10 different models (this may take up several hours)
CreateModels.sh

# Parse and test 10 times using each time a different model and test file
TenFoldCrossVal.sh

# Process the results of parsing with gold tokenization and gold tags
ParseGoldTokResults.py
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash
# Evaluate each fold's model on its held-out test file.
# For every i in 1..10 this runs udpipe with model-${i}.output on
# test-${i}.conllu and stores the accuracy report in gold-tok-tags-${i}.txt.

# Iterate over file pairs
for i in {1..10}; do
    test_file="test-${i}.conllu"
    model="model-${i}.output"
    results="results-${i}.txt"   # only used by the alternative command below
    gold="gold-tok-tags-${i}.txt"

    # Alternative evaluation (tokenize, tag and parse), kept for reference:
    #udpipe --tokenize --tokenizer=ranges --accuracy --tag --parse "${model}" "${test_file}" > "${results}"

    # Parsing accuracy only. Abort on failure so that an empty or partial
    # report is never mistaken for a real result by later processing.
    udpipe --accuracy --parse "${model}" "${test_file}" > "${gold}" || {
        echo "udpipe evaluation failed for fold ${i}" >&2
        exit 1
    }
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
# Author: Leonel Figueiredo de Alencar | ||
# Last update: August 24, 2024 | ||
|
||
import sys | ||
import numpy as np | ||
from AnnotateConllu import extractConlluSents, writeSentsConllu | ||
from conllu import TokenList | ||
from sklearn.model_selection import KFold | ||
|
||
|
||
def divide_treebank(treebank_data, n_parts=10):
    """
    Split a sequence into ``n_parts`` consecutive parts without losing items.

    Sizes differ by at most one: when the length is not a multiple of
    ``n_parts``, the first ``len(treebank_data) % n_parts`` parts each
    receive one extra item, so no trailing items are dropped.

    Args:
        treebank_data: Sliceable sequence (e.g. a list of sentences).
        n_parts (int): Number of parts to produce. Defaults to 10.
    Returns:
        list: ``n_parts`` slices of ``treebank_data`` in original order;
        some are empty when there are fewer items than parts.
    Raises:
        ValueError: If ``n_parts`` is smaller than 1.
    """
    if n_parts < 1:
        raise ValueError("n_parts must be at least 1")

    # Base size of each part and the number of parts needing one more item
    base_size, remainder = divmod(len(treebank_data), n_parts)

    treebank_parts = []
    start = 0
    for index in range(n_parts):
        stop = start + base_size + (1 if index < remainder else 0)
        treebank_parts.append(treebank_data[start:stop])
        start = stop

    return treebank_parts
|
||
|
||
def mkTestTrain(dataset, n_splits=10, random_state=42):
    """
    Write shuffled k-fold train/test splits of ``dataset`` to CoNLL-U files.

    For fold number i (starting at 1) the held-out sentences are written
    to ``test-{i}.conllu`` and the remaining ones to ``train-{i}.conllu``
    in the current working directory, using ``writeSentsConllu``.

    Args:
        dataset: Iterable of sentences.
        n_splits (int): Number of folds. Defaults to 10.
        random_state (int): Seed for the shuffle, so that the folds are
            reproducible. Defaults to 42.
    """
    sentences = list(dataset)

    # Split over positions instead of converting the sentences themselves
    # to a NumPy array: np.array(..., dtype=object) turns equal-length
    # sentences into a 2-D array, which breaks sentence-level indexing.
    positions = np.arange(len(sentences))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, test_index) in enumerate(kf.split(positions), start=1):
        # Split data into train and test sets
        train_data = [sentences[j] for j in train_index]
        test_data = [sentences[j] for j in test_index]
        writeSentsConllu(test_data, f"test-{fold}.conllu")
        writeSentsConllu(train_data, f"train-{fold}.conllu")
|
||
|
||
def main():
    """
    Command-line entry point: split a treebank into train/test fold files.

    Expects exactly one command-line argument, the treebank file name.
    With any other argument count, a usage message is written to stderr
    and the process exits with status 1.
    """
    # Check if a filename is provided as a command line argument
    if len(sys.argv) != 2:
        # Report the real script name, and use stderr so the message does
        # not mix with regular output.
        print(f"Usage: {sys.argv[0]} <treebank_filename>", file=sys.stderr)
        sys.exit(1)

    # Read treebank data from the file
    treebank_filename = sys.argv[1]
    treebank_data = extractConlluSents(treebank_filename)

    # write test and train files
    mkTestTrain(treebank_data)


if __name__ == "__main__":
    main()