# generate_multi_risk_cache.py
# Source: bettergovph/open-data-visualization (GitHub, branch "main").

import pandas as pd
import duckdb
import json
import os
import sys
from pathlib import Path
from collections import defaultdict
import numpy as np
import scipy.stats as stats

# Configuration
# NOTE: both directories are hard-coded absolute paths.
# Directory holding the parquet inputs (dpwh_2026_leaf_nodes.parquet, budget_<year>.parquet).
DATA_DIR = Path('/home/joebert/open-data-visualization/data/parquet')
# Directory holding the static JSON data; condition_risks_v2.json is read from here.
STATIC_DATA_DIR = Path('/home/joebert/open-data-visualization/static/data')
# Destination of the generated multi-risk cache.
OUTPUT_FILE = STATIC_DATA_DIR / 'multi_risk_cache.json'
# Alias of STATIC_DATA_DIR; not referenced in the visible part of this file.
DATA_ROOT = STATIC_DATA_DIR # For consistency with copied logic if needed

def extract_condition_flags(condition_file=None):
    """Collect condition-risk flags per project from ``condition_risks_v2.json``.

    Every entry under the top-level ``"matches"`` key is inspected.  When the
    entry's ``remark`` text contains one of the known risk labels, that label
    is recorded for the entry's ``project_name``.  At most one label is
    recorded per entry: the first one found in the priority order below.

    Args:
        condition_file: Optional path of the JSON file to read.  When omitted,
            ``STATIC_DATA_DIR / 'condition_risks_v2.json'`` is used.

    Returns:
        A ``defaultdict(list)`` mapping project name to its list of labels.
        It is empty when the file does not exist (a warning is printed).
    """
    # Checked in this order; the first label found in the remark wins.
    risk_labels = (
        "High Risk (No Match)",
        "High Risk (No Data)",
        "Highest Risk (Redundant)",
        "Medium Risk (Unaddressed)",
    )

    if condition_file is None:
        condition_file = STATIC_DATA_DIR / 'condition_risks_v2.json'
    condition_file = Path(condition_file)
    flags = defaultdict(list)

    if not condition_file.exists():
        print(f"Warning: {condition_file} not found. Skipping condition flags.")
        return flags

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(condition_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for project in data.get('matches', []):
        name = project.get('project_name')
        remark = project.get('remark')
        # Skip entries without a name, and remarks that are missing or not
        # text (a substring test on a non-string would raise TypeError).
        if not name or not isinstance(remark, str):
            continue

        label = next((lbl for lbl in risk_labels if lbl in remark), None)
        if label is not None:
            flags[name].append(label)

    return flags

def extract_repeated_flags(data_dir=None):
    """Identify 2026 projects whose name also appears in earlier budgets (2020-2025).

    The 2026 leaf-node parquet is grouped by project name (``value``) and
    left-joined on exact name equality against each yearly
    ``budget_<year>.parquet`` (grouped by ``description``).  A project is
    flagged when at least one earlier year has a positive summed amount.

    Args:
        data_dir: Optional directory containing the parquet files.  When
            omitted, ``DATA_DIR`` is used.

    Returns:
        A ``defaultdict(list)`` mapping project name to a single-element list
        such as ``["Repeated (2023, 2024)"]`` (years ascending).  A plain
        empty ``dict`` is returned when the 2026 file is missing or the query
        fails; the error is printed, not raised.
    """
    print("Extracting Repeated flags...")

    if data_dir is None:
        data_dir = DATA_DIR
    data_dir = Path(data_dir)

    parquet_2026 = data_dir / "dpwh_2026_leaf_nodes.parquet"
    if not parquet_2026.exists():
        print("2026 Data file not found for repeated analysis.")
        return {}

    years = [2025, 2024, 2023, 2022, 2021, 2020]
    query = _build_repeated_query(parquet_2026, data_dir, years)

    conn = duckdb.connect()
    try:
        df = conn.execute(query).df()
        flags = defaultdict(list)
        amount_cols = [(str(year), f"amount_{year}") for year in years]

        for record in df.to_dict('records'):
            # A year with no match comes back as NULL/NaN from the LEFT JOIN.
            repeated_years = [
                label
                for label, col in amount_cols
                if pd.notna(record.get(col)) and record[col] > 0
            ]
            if repeated_years:
                flags[record['name']].append(
                    f"Repeated ({', '.join(sorted(repeated_years))})"
                )

        return flags

    except Exception as e:
        # Best-effort: a failed analysis must not abort cache generation.
        print(f"Error extracting repeated flags: {e}")
        return {}
    finally:
        # Always release the in-memory DuckDB connection.
        conn.close()


def _sql_path_literal(path):
    """Return *path* as a single-quoted SQL string literal (inner quotes doubled)."""
    return "'" + str(path).replace("'", "''") + "'"


def _build_repeated_query(parquet_2026, data_dir, years):
    """Build the DuckDB query joining 2026 projects against each year's budget file."""
    # Blocklist Logic (Same as visualization.py): drops non-project rows
    # such as office and region headings.
    blocklist_clause = """
        AND value NOT ILIKE 'Central Office%'
        AND value NOT ILIKE 'GOP%'
        AND value NOT ILIKE 'Loan Proceeds%'
        AND value NOT ILIKE 'Region %'
        AND value NOT ILIKE '%District Engineering Office%'
        AND value NOT ILIKE '%DEO%'
        AND value NOT ILIKE '%Cordillera Administrative Region%'
        AND value NOT ILIKE '%National Capital Region%'
        AND value NOT ILIKE '%Public-Private Partnership%'
        AND value NOT ILIKE '%Management of Construction and Maintenance Equipment%'
        AND value NOT ILIKE '%Infrastructure Research, Quality Control and Management%'
        AND value NOT ILIKE '%Construction / Rehabilitation of Septage%'
        AND NOT (value ILIKE '%priority%' AND value ILIKE '%projects%')
    """

    ctes = [f"""
        Proj2026 AS (
            SELECT
                value as name,
                SUM(amount) as amount_2026
            FROM read_parquet({_sql_path_literal(parquet_2026)})
            WHERE amount IS NOT NULL AND amount > 0
            {blocklist_clause}
            GROUP BY value
        )
    """]
    select_cols = []
    joins = []
    predicates = []

    for year in years:
        alias = f"h{str(year)[-2:]}"
        history_path = data_dir / f"budget_{year}.parquet"

        if history_path.exists():
            # NOTE(review): amounts are scaled by 1000 here, presumably because
            # the yearly budget files store thousands -- confirm.
            history_select = (
                f"SELECT description as name, SUM(amount) * 1000 as amount_{year} "
                f"FROM read_parquet({_sql_path_literal(history_path)}) "
                f"GROUP BY description"
            )
        else:
            # Missing year: an empty relation with the same columns keeps the
            # join and select lists valid.
            history_select = f"SELECT '' as name, 0 as amount_{year} WHERE 1=0"

        ctes.append(f"""
        History{year} AS (
            {history_select}
        )""")
        select_cols.append(f"{alias}.amount_{year}")
        joins.append(f"LEFT JOIN History{year} {alias} ON p.name = {alias}.name")
        predicates.append(f"{alias}.amount_{year} > 0")

    cte_part = ",\n".join(ctes)
    select_part = ",\n".join(select_cols)
    join_part = "\n".join(joins)
    where_part = " OR ".join(predicates)

    return f"""
        WITH {cte_part}
        SELECT
            p.name,
            {select_part}
        FROM Proj2026 p
        {join_part}
        WHERE ({where_part})
    """

def extract_cost_outlier_flags(parquet_file=None):
    """Flag projects whose total amount is unusually high for their category.

    Projects are read from the 2026 leaf-node parquet (``value`` as name,
    rows with ``amount > 0``) and bucketed into coarse categories by keyword
    match on the lower-cased name.  Within each category of at least five
    rows, a project is flagged when its amount exceeds the category mean plus
    two population standard deviations.

    This compares *total* cost, not unit cost: no quantity (length, number of
    units) is parsed from the project name, so it is only a rough proxy for a
    unit-cost outlier.

    Args:
        parquet_file: Optional path of the parquet file to analyse.  When
            omitted, ``DATA_DIR / "dpwh_2026_leaf_nodes.parquet"`` is used.

    Returns:
        A ``defaultdict(list)`` mapping project name to flags of the form
        ``"Cost Outlier (><category> Avg)"``; a plain empty ``dict`` when the
        parquet file does not exist.
    """
    print("Extracting Cost Outlier flags...")

    if parquet_file is None:
        parquet_file = DATA_DIR / "dpwh_2026_leaf_nodes.parquet"
    parquet_file = Path(parquet_file)
    if not parquet_file.exists():
        return {}

    escaped_path = str(parquet_file).replace("'", "''")
    query = f"""
        SELECT value as name, amount
        FROM read_parquet('{escaped_path}')
        WHERE amount > 0
    """

    conn = duckdb.connect()
    try:
        df = conn.execute(query).df()
    finally:
        # Always release the in-memory DuckDB connection.
        conn.close()

    return _flag_cost_outliers(df)


def _flag_cost_outliers(df):
    """Return ``{name: [flag, ...]}`` for rows of *df* (columns ``name``, ``amount``) above their category threshold."""
    flags = defaultdict(list)
    if df.empty:
        return flags

    # Define simple categories by keyword (substring match, lower-cased).
    categories = {
        'MPB': ['multi-purpose building', 'multipurpose building'],
        'Road Concreting': ['concreting of road'],
        'Flood Control': ['flood control', 'flood mitigation', 'revetment', 'dike'],
        'School Building': ['school building', 'classroom'],
        'Solar Water': ['solar water system', 'water supply'],
        'Street Light': ['solar street light', 'led street light']
    }

    # Null names become empty text so the substring test cannot raise.
    names_lower = df['name'].fillna('').astype(str).str.lower()

    # When a name matches several categories, the last one in dict order wins.
    category = pd.Series('Other', index=df.index, dtype=object)
    for cat, keywords in categories.items():
        mask = names_lower.apply(lambda text: any(k in text for k in keywords))
        category[mask] = cat

    for cat in categories:
        cat_df = df[category == cat]
        # Too few samples for the mean/std to be meaningful.
        if len(cat_df) < 5:
            continue

        amounts = cat_df['amount'].to_numpy(dtype=float)
        # Flag if amount is > mean + 2 * population standard deviation.
        threshold = amounts.mean() + 2 * amounts.std()

        for name in cat_df.loc[cat_df['amount'] > threshold, 'name']:
            flags[name].append(f"Cost Outlier (>{cat} Avg)")

    return flags

def generate_cache():
    """Build ``multi_risk_cache.json`` from the condition, repeated and cost-outlier flags.

    Steps:
        1. Gather flags from ``extract_condition_flags``,
           ``extract_repeated_flags`` and ``extract_cost_outlier_flags``.
        2. Read the master project list (name, amount) from the 2026
           leaf-node parquet, keeping rows with ``amount > 0``.
        3. Attach flags to projects by exact name match; flags for names not
           in the master list are dropped.
        4. Keep only projects with at least one flag; ``score`` is the number
           of distinct flags.
        5. Sort by score descending, then amount descending.
        6. Write ``{"success", "data", "total_flagged"}`` to ``OUTPUT_FILE``.

    Returns:
        None.  If the master parquet cannot be read, the error is printed and
        nothing is written.
    """
    print("Generating Multi-Risk Cache...")

    # 1. Gather Flags
    flag_sources = (
        extract_condition_flags(),
        extract_repeated_flags(),
        extract_cost_outlier_flags(),
    )

    # 2. Extract Base Project List (All Unique Projects)
    parquet_2026 = DATA_DIR / "dpwh_2026_leaf_nodes.parquet"
    p2026_path = str(parquet_2026).replace("'", "''")
    conn = duckdb.connect()
    try:
        df_master = conn.execute(
            f"SELECT value as name, amount FROM read_parquet('{p2026_path}') WHERE amount > 0"
        ).df()
    except Exception as e:
        print(f"Error reading master file: {e}")
        return
    finally:
        # Always release the in-memory DuckDB connection.
        conn.close()

    # 3. Aggregate: name -> {amount, flags}
    # When a name occurs more than once, the first occurrence's amount is kept.
    # tolist() yields native Python numbers, which json.dump can serialise.
    project_map = {}
    for name, amount in zip(df_master['name'].tolist(), df_master['amount'].tolist()):
        if name not in project_map:
            project_map[name] = {'amount': amount, 'flags': set()}

    # Attach flags; a set de-duplicates labels reported more than once.
    for source in flag_sources:
        for name, flags in source.items():
            entry = project_map.get(name)
            if entry is not None:
                entry['flags'].update(flags)

    # 4. Filter and Format: only projects with at least one flag are kept.
    results = []
    for name, entry in project_map.items():
        flags_list = sorted(entry['flags'])
        if flags_list:
            results.append({
                'project_name': name,
                'amount': entry['amount'],
                'flags': flags_list,
                'score': len(flags_list)
            })

    # 5. Sort: Score Descending, then Amount Descending
    results.sort(key=lambda item: (-item['score'], -item['amount']))

    print(f"Found {len(results)} flagged projects.")

    # 6. Save (create the output directory if it does not exist yet)
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump({
            "success": True,
            "data": results,
            "total_flagged": len(results)
        }, f, indent=2)

    print(f"Saved to {OUTPUT_FILE}")

# Script entry point: build the cache only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_cache()