64 commits
ba600cc
Add documentation for BPI2020 Domestic Declarations dataset and syste…
TataSatyaPratheek Mar 10, 2025
0f1d20f
Enhance GAT model with residual connections, batch normalization, and…
TataSatyaPratheek Mar 10, 2025
266f8da
Update requirements and README for enhanced installation instructions…
TataSatyaPratheek Mar 10, 2025
f2ceadb
Refactor main process and GAT model for improved validation reporting…
TataSatyaPratheek Mar 10, 2025
75f072c
Add new analysis papers and enhance LSTM/GAT model training with visu…
TataSatyaPratheek Mar 10, 2025
7e23d6a
Merge branch 'main' of https://github.com/TataSatyaPratheek/GNN
TataSatyaPratheek Mar 10, 2025
000a4a7
Remove redundant import statements in main.py and process_mining.py
TataSatyaPratheek Mar 10, 2025
9ce189e
Remove obsolete PDF and markdown files from paper analysis directory
TataSatyaPratheek Mar 10, 2025
71c629a
Add new PDF files and markdown documents for thesis statement and pro…
TataSatyaPratheek Mar 10, 2025
6d1ddcd
Updated with ablation study
TataSatyaPratheek Mar 10, 2025
a1e3155
fixed the issues with data processing
TataSatyaPratheek Mar 10, 2025
271abc4
Refactor data loading to use standard preprocessing directly, removin…
TataSatyaPratheek Mar 10, 2025
18d0038
Fix position calculations in EnhancedGNN and PositionalEncoding to en…
TataSatyaPratheek Mar 10, 2025
11428db
Enhance ablation study evaluation by adding graph-level target conver…
TataSatyaPratheek Mar 11, 2025
654d16e
Improve t-SNE and UMAP visualization by ensuring safe perplexity and …
TataSatyaPratheek Mar 11, 2025
d722d0c
debugged all the issues with dt, rf, xgboost
TataSatyaPratheek Mar 11, 2025
9c855ae
Add memory optimization utilities and ablation study script
TataSatyaPratheek Mar 11, 2025
ecd0175
Add initial implementation of process mining package with CLI, utilit…
TataSatyaPratheek Mar 11, 2025
5b218a7
Add base model interfaces and conformance checking utilities for proc…
TataSatyaPratheek Mar 11, 2025
9eea5c1
Add CLI modules for logging, device setup, and data processing pipeline
TataSatyaPratheek Mar 11, 2025
6a23fb7
Add centralized device management and checkpointing utilities; update…
TataSatyaPratheek Mar 11, 2025
b6bc857
simplified the package
TataSatyaPratheek Mar 11, 2025
f835b84
Add unit tests for process mining utilities to improve code coverage
TataSatyaPratheek Mar 11, 2025
3371843
Add module initializations and documentation for process mining compo…
TataSatyaPratheek Mar 12, 2025
d5e4b7e
Add error handling and logging for data processing pipeline
TataSatyaPratheek Mar 12, 2025
5269a09
Refactor data module imports and enhance graph data processing in LST…
TataSatyaPratheek Mar 12, 2025
bb42289
Refactor process mining utilities for improved readability and mainta…
TataSatyaPratheek Mar 12, 2025
e7fd874
Remove outdated heterogeneous graph building function and update docu…
TataSatyaPratheek Mar 12, 2025
8b003f7
Add model factory module and update core imports for enhanced model m…
TataSatyaPratheek Mar 12, 2025
746c68a
Refactor imports across multiple modules for improved organization an…
TataSatyaPratheek Mar 12, 2025
833da30
Add initial module structure and core functionality for process minin…
TataSatyaPratheek Mar 13, 2025
af9d281
Implement new data visualization features for enhanced user insights
TataSatyaPratheek Mar 13, 2025
eec418c
Refactor graph building imports and enhance synthetic data generation…
TataSatyaPratheek Mar 13, 2025
422c67a
Refactor import statements for improved module organization and consi…
TataSatyaPratheek Mar 13, 2025
568ea3c
Refactor CLI argument handling and enhance graph data building tests …
TataSatyaPratheek Mar 13, 2025
7731f8d
Update process mining toolkit to use DGL for graph operations and enh…
TataSatyaPratheek Mar 13, 2025
9ca9779
Add DGL as a core dependency and update data loading to use DGL's Gra…
TataSatyaPratheek Mar 13, 2025
0837a4e
Integrate DGL for graph-level target extraction and enhance evaluatio…
TataSatyaPratheek Mar 13, 2025
f5b3037
Enhance graph processing and visualization with DGL integration and o…
TataSatyaPratheek Mar 13, 2025
6668497
Refactor graph data processing to utilize DGL for improved feature ha…
TataSatyaPratheek Mar 13, 2025
900a388
Enhance DGL graph data handling with improved error checking and opti…
TataSatyaPratheek Mar 13, 2025
24a48d3
Implement additional optimizations for DGL graph processing and enhan…
TataSatyaPratheek Mar 13, 2025
5486b6e
Add memory management utilities for DGL data loading and processing
TataSatyaPratheek Mar 13, 2025
a801306
Enhance DGL sampling methods in training process and optimize Express…
TataSatyaPratheek Mar 13, 2025
0f871fc
Add DGL memory optimization and ablation study CLI integration
TataSatyaPratheek Mar 13, 2025
4c17348
Add ablation study CLI integration and enhance model evaluation utili…
TataSatyaPratheek Mar 13, 2025
fdcb88f
Implement optimizations for DGL graph processing and improve memory m…
TataSatyaPratheek Mar 13, 2025
82cc6e7
Refactor LSTM and RNN models to enhance memory efficiency and remove …
TataSatyaPratheek Mar 13, 2025
819d901
Refactor ExpressiveGATConv for improved dropout handling and add DGL …
TataSatyaPratheek Mar 13, 2025
88a975b
Refactor DGL integration in CLI and model training, enhance sampling …
TataSatyaPratheek Mar 13, 2025
702e21c
Refactor memory management in model training and enhance DGL integrat…
TataSatyaPratheek Mar 13, 2025
c417928
Add advanced workflow example and integrate ablation study functionality
TataSatyaPratheek Mar 13, 2025
ed9c6e4
Update DGL version, add ablation study scripts, and enhance preproces…
TataSatyaPratheek Mar 18, 2025
17c1ec8
Enhance GNN model architecture with additional layers, improve column…
TataSatyaPratheek Mar 18, 2025
da5f290
Refactor interactive process flow visualization for compatibility wit…
TataSatyaPratheek Mar 18, 2025
0df0718
Fix adaptive normalization to ensure proper type handling and improve…
TataSatyaPratheek Mar 18, 2025
06c7693
Add use_edge_features parameter to build_graph_data for enhanced grap…
TataSatyaPratheek Mar 18, 2025
4de1036
Enhance get_batch_graphs_from_indices to support NumPy array indices …
TataSatyaPratheek Mar 18, 2025
ec47711
Refactor stratified data splitting for graph classification tasks, en…
TataSatyaPratheek Mar 18, 2025
a052d4c
Refactor model training and prediction logic to handle tuple losses a…
TataSatyaPratheek Mar 18, 2025
9f73d87
Add loss display utility to handle various loss formats in training p…
TataSatyaPratheek Mar 18, 2025
7d5f6ad
Refactor model creation logic to standardize parameter handling and i…
TataSatyaPratheek Mar 18, 2025
b1d9957
Enhance model evaluation and factory logic to support diverse loss ty…
TataSatyaPratheek Mar 18, 2025
2b4479d
Add prediction method to LSTM model and enhance tensor dataloader for…
TataSatyaPratheek Mar 18, 2025
99 changes: 90 additions & 9 deletions README.md
@@ -1,4 +1,4 @@
## Process Mining with Graph Neural Networks
# Process Mining with Graph Neural Networks

An advanced implementation combining Graph Neural Networks, Deep Learning, and Process Mining techniques for business process analysis and prediction.

@@ -68,7 +68,18 @@ git clone https://github.com/ERPdotAI/GNN.git
cd GNN
```

2. Install dependencies:
2. Create and activate a virtual environment:
```bash
# For Linux/macOS
python -m venv pm-venv
source pm-venv/bin/activate

# For Windows
python -m venv pm-venv
pm-venv\Scripts\activate
```

3. Install dependencies:
```bash
pip install -r requirements.txt
```
@@ -84,10 +95,36 @@ The system expects process event logs in CSV format with the following structure

## 8. Usage

### Basic Usage

```bash
python main.py <input-file-path>
```

For example:
```bash
python main.py input/BPI2020_DomesticDeclarations.csv
```

### Advanced Options

The script supports several command-line arguments:

```bash
python main.py input/BPI2020_DomesticDeclarations.csv --epochs 30 --batch-size 64 --norm-features
```

Available options:
- `--epochs`: Number of epochs for GNN training (default: 20)
- `--lstm-epochs`: Number of epochs for LSTM training (default: 5)
- `--batch-size`: Batch size for training (default: 32)
- `--norm-features`: Use L2 normalization for features
- `--skip-rl`: Skip reinforcement learning step
- `--skip-lstm`: Skip LSTM modeling step
- `--output-dir`: Custom output directory

### Output Structure

Results are stored in timestamped directories under `results/` with the following structure:
```
results/run_timestamp/
@@ -100,30 +137,74 @@

## 9. Technical Details

Graph Neural Network Architecture
### Graph Neural Network Architecture
- Multi-head attention mechanisms
- Dynamic graph construction
- Adaptive feature learning
- Custom loss functions for process-specific metrics
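
Below is a minimal sketch of how such a block can be composed in DGL (which later commits in this PR adopt for graph operations): one multi-head `GATConv` layer followed by batch normalization and a residual connection. The class name and hyperparameters are illustrative assumptions, not the repository's actual API.

```python
import torch
import torch.nn as nn
from dgl.nn import GATConv

class GATBlock(nn.Module):
    """Hypothetical multi-head attention block with residual + batch norm."""
    def __init__(self, in_feats, out_feats, num_heads=4, dropout=0.2):
        super().__init__()
        # Each head produces out_feats features; head outputs are concatenated.
        self.conv = GATConv(in_feats, out_feats, num_heads,
                            feat_drop=dropout, attn_drop=dropout)
        self.norm = nn.BatchNorm1d(out_feats * num_heads)
        # Linear projection so the residual matches the concatenated width.
        self.skip = nn.Linear(in_feats, out_feats * num_heads)

    def forward(self, graph, feat):
        h = self.conv(graph, feat)   # (N, num_heads, out_feats)
        h = h.flatten(1)             # (N, num_heads * out_feats)
        return torch.relu(self.norm(h + self.skip(feat)))
```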

LSTM Implementation
### LSTM Implementation
- Bidirectional sequence modeling
- Variable-length sequence handling
- Custom embedding layer for process activities
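
A minimal sketch of this pattern in PyTorch, assuming padded activity-index sequences; the class and parameter names are illustrative, not the repository's actual code.

```python
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class ActivityLSTM(nn.Module):
    """Hypothetical bidirectional LSTM over variable-length activity traces."""
    def __init__(self, num_activities, embed_dim=32, hidden_dim=64):
        super().__init__()
        # Custom embedding for process activities; index 0 reserved for padding.
        self.embed = nn.Embedding(num_activities, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            batch_first=True, bidirectional=True)
        self.head = nn.Linear(2 * hidden_dim, num_activities)

    def forward(self, seqs, lengths):
        # Packing lets the LSTM skip padded positions of shorter traces.
        packed = pack_padded_sequence(self.embed(seqs), lengths.cpu(),
                                      batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(out, batch_first=True)
        return self.head(out)  # per-step next-activity logits
```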

Process Mining Components
### Process Mining Components
- Inductive miner implementation
- Token-based replay
- Custom conformance checking metrics
- Advanced bottleneck detection algorithms
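
For reference, inductive-miner discovery and token-based replay can be combined with PM4Py roughly as follows. This is a hedged sketch; the column names are assumptions about the event-log schema, not the repository's actual configuration.

```python
import pandas as pd
import pm4py

# Assumed column names; adjust to the actual CSV schema.
df = pd.read_csv("input/BPI2020_DomesticDeclarations.csv")
log = pm4py.format_dataframe(df, case_id="case_id",
                             activity_key="activity",
                             timestamp_key="timestamp")

# Discover a Petri net with the inductive miner, then replay the log on it.
net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(log)
fitness = pm4py.fitness_token_based_replay(log, net, initial_marking, final_marking)
print(fitness)
```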

Reinforcement Learning
### Reinforcement Learning
- Custom environment for process optimization
- State-action space modeling
- Policy gradient methods
- Resource allocation optimization
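
As a generic illustration of the policy-gradient part only (the project's actual environment and reward design are not shown here), a REINFORCE-style loss can be computed as:

```python
import torch

def reinforce_loss(log_probs, rewards, gamma=0.99):
    """Generic REINFORCE loss; log_probs are log pi(a_t|s_t) per step."""
    returns, g = [], 0.0
    for r in reversed(rewards):          # discounted return-to-go
        g = r + gamma * g
        returns.append(g)
    returns = torch.tensor(list(reversed(returns)))
    # Normalizing returns reduces gradient variance.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return -(torch.stack(log_probs) * returns).sum()
```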

## 10. Contributing
### Visualization Capabilities
- Process flow network diagrams
- Bottleneck identification
- Transition heatmaps
- Interactive Sankey diagrams
- Cycle time distributions
- Task embedding visualizations
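
For example, an interactive Sankey diagram of activity transitions can be produced with Plotly; the activities and counts below are placeholders, not output of this repository.

```python
import plotly.graph_objects as go

activities = ["Submit", "Approve", "Pay"]  # placeholder activity labels
fig = go.Figure(go.Sankey(
    node=dict(label=activities),
    link=dict(source=[0, 1],     # indices into `activities` (edge start)
              target=[1, 2],     # indices into `activities` (edge end)
              value=[120, 95]),  # placeholder transition frequencies
))
fig.write_html("results/process_sankey.html")
```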

## 10. Troubleshooting

### Common Issues

1. **UMAP/Numba version incompatibility**

If you encounter an error like:
```
ImportError: Numba needs NumPy 2.1 or less. Got NumPy 2.2.
```

The code is designed to handle this gracefully by falling back to t-SNE for dimensionality reduction.
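
A sketch of that fallback logic (not the repository's exact code), with the perplexity kept safely below the sample count:

```python
import numpy as np

def reduce_to_2d(embeddings: np.ndarray) -> np.ndarray:
    try:
        import umap  # raises ImportError under incompatible Numba/NumPy
        return umap.UMAP(n_components=2).fit_transform(embeddings)
    except ImportError:
        from sklearn.manifold import TSNE
        # t-SNE requires perplexity < number of samples.
        perplexity = min(30, max(2, len(embeddings) - 1))
        return TSNE(n_components=2,
                    perplexity=perplexity).fit_transform(embeddings)
```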

2. **PM4Py installation issues**

If PM4Py installation fails, you can use the code without conformance checking:
```bash
python main.py <input-file-path> --skip-conformance
```

3. **CUDA/GPU issues**

The code will automatically detect and use the appropriate device (CUDA, MPS, or CPU).
If you encounter GPU memory issues, try reducing the batch size:
```bash
python main.py <input-file-path> --batch-size 16
```
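
The selection logic is roughly the following (a minimal sketch; the repository's actual helper may differ):

```python
import torch

def pick_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")        # NVIDIA GPU
    if torch.backends.mps.is_available():  # Apple Silicon GPU
        return torch.device("mps")
    return torch.device("cpu")
```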

### Getting Help

If you encounter issues not covered above, please open an issue on the GitHub repository with:
- Full error message
- Python version
- OS details
- Dependencies list (output of `pip freeze`)

## 11. Contributing

We welcome contributions from the research community. Please follow these steps:

@@ -132,7 +213,7 @@ We welcome contributions from the research community. Please follow these steps:
3. Implement your changes
4. Submit a pull request with detailed documentation

## 11. Citation
## 12. Citation

If you use this code in your research, please cite:

@@ -144,4 +225,4 @@ If you use this code in your research, please cite:
publisher = {ERP.AI},
url = {https://github.com/ERPdotAI/GNN}
}
```
27 changes: 27 additions & 0 deletions ablation.sh
@@ -0,0 +1,27 @@
#!/bin/bash
set -euo pipefail

# Original parameters unchanged
DATASET="input/BPI2020_DomesticDeclarations.csv"
OUTPUT_DIR="ablation_results"
LOG_DIR="${OUTPUT_DIR}/logs"
mkdir -p "$LOG_DIR"

# decision_tree random_forest xgboost
for MODEL in mlp lstm basic_gat positional_gat diverse_gat enhanced_gnn; do
echo "Running ablation for: $MODEL"

(
python -c "import torch; torch.cuda.empty_cache()" 2>/dev/null
python main.py $DATASET \
--run-ablation \
--model-type $MODEL \
--output-dir "${OUTPUT_DIR}/${MODEL}" \
--batch-size 32 \
--epochs 5 | tee "${LOG_DIR}/${MODEL}.log"
)

sleep 1
done

echo "All ablation studies completed"
73 changes: 73 additions & 0 deletions ablation_logs/fix_preprocessing.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Patched data preprocessing module to fix tuple handling issues
"""

import importlib.util
import sys
from colorama import Fore, Style

# Load the main module
spec = importlib.util.spec_from_file_location("main", "main.py")
main = importlib.util.module_from_spec(spec)
sys.modules["main"] = main
spec.loader.exec_module(main)

# Fix load_and_preprocess_data_phase1 to properly handle tuple results
def patched_load_and_preprocess_data_phase1(data_path, args):
    from modules.data_preprocessing import load_and_preprocess_data, create_feature_representation, build_graph_data

    main.print_section_header("Loading and Preprocessing Data with Phase 1 Enhancements")

    # Load and preprocess data
    result = load_and_preprocess_data(
        data_path,
        use_adaptive_norm=args.adaptive_norm,
        enhanced_features=args.enhanced_features,
        enhanced_graphs=args.enhanced_graphs,
        batch_size=args.batch_size
    )

    # Proper type checking with diagnostic output
    if isinstance(result, tuple):
        print(f"{Fore.YELLOW}Debug: load_and_preprocess_data returned tuple of length {len(result)}{Style.RESET_ALL}")

        if len(result) == 4:
            # Properly returns (df, graphs, task_encoder, resource_encoder)
            return result
        elif len(result) >= 1:
            # Extract the dataframe from the first element if it's a dataframe
            candidate_df = result[0]
            if hasattr(candidate_df, 'columns'):
                print(f"{Fore.GREEN}Debug: Successfully extracted dataframe from tuple[0]{Style.RESET_ALL}")
                df = candidate_df
            else:
                print(f"{Fore.RED}Error: First element of tuple is not a dataframe{Style.RESET_ALL}")
                df = result  # Let it fail later with a clear error
        else:
            print(f"{Fore.RED}Error: Returned tuple is empty{Style.RESET_ALL}")
            df = result  # Let it fail later with a clear error
    else:
        # Just returns a dataframe or other object
        df = result

    # Process the dataframe normally
    if hasattr(df, 'columns'):
        # Create feature representation
        df, task_encoder, resource_encoder = create_feature_representation(df, use_norm_features=args.adaptive_norm)
        graphs = build_graph_data(df)
        return df, graphs, task_encoder, resource_encoder
    else:
        print(f"{Fore.RED}Error: df is not a dataframe, it's a {type(df)}{Style.RESET_ALL}")
        raise TypeError(f"Expected DataFrame, got {type(df)}")

# Apply our patch
main.load_and_preprocess_data_phase1 = patched_load_and_preprocess_data_phase1

# Run the main function with the arguments passed to this script
if __name__ == "__main__":
    # Pass all arguments to main function
    main.main()
24 changes: 24 additions & 0 deletions ablation_logs/run_ablation.py
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
import sys
import subprocess

# Get the command line arguments
args = sys.argv[1:]

# Print what we're going to run
print(f"Running: python main.py {' '.join(args)}")

# Run the command and capture output
try:
    result = subprocess.run(['python', 'main.py'] + args,
                            check=True,
                            text=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    print(result.stdout)
    sys.exit(0)
except subprocess.CalledProcessError as e:
    print(f"Error running main.py: {e}")
    print(e.stdout)
    sys.exit(e.returncode)
56 changes: 56 additions & 0 deletions clear_gpu_memory.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import torch
import gc
import os
import psutil
from colorama import Fore, Style, init

# Initialize colorama
init()

def clear_gpu_memory():
    """Release cached GPU memory and report before/after usage."""
    print(f"{Fore.CYAN}Clearing GPU memory...{Style.RESET_ALL}")

    # Force garbage collection
    gc.collect()

    if torch.cuda.is_available():
        # Get initial memory stats
        initial_allocated = torch.cuda.memory_allocated() / (1024**2)
        initial_reserved = torch.cuda.memory_reserved() / (1024**2)

        print(f"Initial GPU memory: {initial_allocated:.1f} MB allocated, {initial_reserved:.1f} MB reserved")

        # Empty cache
        torch.cuda.empty_cache()

        # Synchronize device
        torch.cuda.synchronize()

        # Get final memory stats
        final_allocated = torch.cuda.memory_allocated() / (1024**2)
        final_reserved = torch.cuda.memory_reserved() / (1024**2)

        print(f"Final GPU memory: {final_allocated:.1f} MB allocated, {final_reserved:.1f} MB reserved")
        print(f"Freed {initial_reserved - final_reserved:.1f} MB")
    else:
        print(f"{Fore.YELLOW}No GPU available{Style.RESET_ALL}")

    # Also report CPU memory
    cpu_memory = psutil.virtual_memory()
    print(f"CPU memory: {cpu_memory.percent}% used, {cpu_memory.available / (1024**3):.2f} GB available")

if __name__ == "__main__":
    clear_gpu_memory()

    # Also kill any orphaned CUDA processes on Linux.
    # Caution: this force-kills every python process holding GPU memory.
    # The machine-readable query output gives a reliable PID field.
    if os.name == 'posix':
        try:
            os.system(
                "nvidia-smi --query-compute-apps=pid,process_name --format=csv,noheader"
                " | grep python | cut -d, -f1 | xargs -r kill -9"
            )
            print(f"{Fore.GREEN}Killed orphaned CUDA processes{Style.RESET_ALL}")
        except Exception:
            pass