-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
29 lines (24 loc) · 3.02 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from dotenv import load_dotenv
from utils import *
from llm_dataset_gen import LLMDataset
def main(
    lq_code: str = "ambiguity",
    num_samples: int = 40,
    data_filepath: str = "./data/LC_Dataset3.csv",
) -> None:
    """Ask the LLM-backed dataset to add rows for one Language Quality category.

    Builds a natural-language context describing the dataset and the requested
    Language Quality (LQ) category, prints it, and forwards it to
    ``LLMDataset.add_data``. With the default arguments this performs exactly
    the run the script was originally hard-coded for: 'ambiguity', 40 samples,
    ``./data/LC_Dataset3.csv``.

    Args:
        lq_code: LQ category to generate data for. One of ``"incompleteness"``,
            ``"meaningless"``, ``"redundancy"`` or ``"ambiguity"``.
        num_samples: Passed unchanged to ``LLMDataset.add_data``.
        data_filepath: Passed unchanged to ``LLMDataset`` as ``dataset_path``.
            To create an empty dataset, use the create_dataset.py script.

    Raises:
        ValueError: If ``lq_code`` is not one of the four LQ categories.
    """
    # Definitions of the Language Construct (LC) building blocks; both are
    # always included in the context.
    lc_definitions = {
        "symbol": "A Symbol (represented as SY) is a Noun or set of nouns acting as a substantial representation of the concept meant. The symbol stands for and represents the concept.",
        "concept": "A Concept (represented as CO) is a Noun or set of nouns introducing new intrinsic or mutual properties.",
    }
    # Definitions of the Language Quality (LQ) issues; only the requested one
    # is included in the context.
    lq_definitions = {
        "incompleteness": "Incomplete (represented as INC) means that after having described a concept, no symbol is introduced or determined to refer to this concept.",
        "meaningless": "Meaningless (represented as MLN) means that after having used a symbol, no definition of the symbol is given and the subsequent utterances make clear that the corresponding concept is missing or unclear.",
        "redundancy": "Redundancy (represented as RDC) means that after having used a symbol for the reference of a concept, a different symbol is used for the reference of the same concept.",
        "ambiguity": "Ambiguity (represented as AMB) means that after having used a symbol for the reference of a concept, a different concept can be detected, which is represented by the same symbol.",
    }

    # Fail fast, before touching the environment or the dataset file.
    if lq_code not in lq_definitions:
        raise ValueError(
            f"Unknown lq_code {lq_code!r}; expected one of: "
            f"{', '.join(lq_definitions)}"
        )

    load_dotenv()

    # Per the original author's note, the LLMDataset class creates its own
    # description of the dataset's column names so that the LLM can generate
    # data in a matching format. An additional context should still be supplied
    # on every add_data call to describe the data that should be added.
    # The adjacent literals below concatenate into one string; each piece ends
    # with the single space that separates it from the next.
    dataset_context = (
        "For Context, the dataset represents requirements engineering excerpts "
        "and their corresponding Language Construct (LC) and Language Quality "
        "(LQ) codings, as described by a research paper titled 'Language "
        "quality in requirements development: tracing communication in the "
        "process of information systems development'. "
        "Language Construct (LC) represents the linguistic construction of a "
        "symbol-concept relationship that can be observed during the language "
        "development process. "
        "LC Codings may contain concepts (CO) and symbols (SY). "
        f"{lc_definitions['concept']} {lc_definitions['symbol']} "
        "The LC Codings are represented by the 'SY1, SY2, CO1, and CO2' "
        "columns in the dataset. "
        "However, the generated data does not necessarily need to have 2 "
        "symbols and 2 concepts, and if that is the case, leave the "
        "corresponding entry empty. "
        "The dataset contains 4 types of language quality issues: "
        "incompleteness (INC), meaningless (MLN), redundancy (RDC), and "
        "ambiguity (AMB). "
        f"For the generated data, ensure the Language Quality (LQ Code) is '{lq_code}'. "
        f"{lq_definitions[lq_code]} "
        "Ensure each generated Excerpt is unique."
    )

    dataset = LLMDataset(dataset_path=data_filepath)
    dataset.print_attributes()
    print(f"\nDataset Context: {dataset_context}")
    dataset.add_data(context=dataset_context, num_samples=num_samples)
# Run the generation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()