-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
58 lines (44 loc) · 1.64 KB
/
data_loader.py
File metadata and controls
58 lines (44 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
# Default location of the BBC News CSV; used when load_data() is called
# without an explicit path.
DEFAULT_DATA_PATH = 'data/bbc_text_cls.csv'


def load_data(data_path=DEFAULT_DATA_PATH) -> "tuple[pd.Series, pd.Series]":
    """Load the BBC News dataset for text classification.

    Reads the CSV at ``data_path`` with ``pandas.read_csv``, checks that it
    is non-empty and contains the ``text`` and ``labels`` columns, prints a
    short summary (shape, columns, label distribution) to stdout, and
    returns the two columns.

    Args:
        data_path: Path to the CSV data file. Defaults to
            ``DEFAULT_DATA_PATH``.

    Returns:
        tuple: ``(inputs, labels)`` - the ``text`` column and the ``labels``
        column of the loaded DataFrame.

    Raises:
        FileNotFoundError: If the data file does not exist.
        ValueError: If the file is empty, cannot be read or parsed, or is
            missing a required column.
    """
    print(f"Loading data from {data_path}")

    # Keep the try body to the single call that can fail. Every re-raise is
    # chained with ``from`` so the original pandas/OS error stays attached
    # as ``__cause__`` instead of showing up as an unrelated secondary error.
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Data file not found: {data_path}") from e
    except pd.errors.EmptyDataError as e:
        raise ValueError(f"Data file is empty: {data_path}") from e
    except Exception as e:
        # Any other read/parse failure is reported as ValueError, which is
        # the exception type this function documents for unreadable input.
        raise ValueError(f"Error reading data file: {e}") from e

    # Check if DataFrame is empty (e.g. a header row with no data rows)
    if df.empty:
        raise ValueError(f"Loaded DataFrame is empty: {data_path}")

    # Check required columns
    required_columns = ['text', 'labels']
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(
            f"Missing required columns in data file: {missing_cols}"
        )

    # Check the data structure
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Separate features and labels
    inputs = df['text']
    labels = df['labels']

    # Print class distribution info. The total is non-zero here because an
    # empty DataFrame was rejected above; compute it once, outside the loop.
    print("\nLabel distribution:")
    total = len(labels)
    for label, count in labels.value_counts().items():
        print(f" {label}: {count} documents ({count / total * 100:.1f}%)")

    return inputs, labels