
Commit 05af9f8

Update Prompts and Add Some Drivers (#10)
* update the list of prompts
* add checkpointing
* add throughput
* update generation
* add stack analysis script
* update throughput scripts
* add implementations for drivers that have been tested
* update analysis scripts
* updated set of currently working drivers
* update scripts and prompts
* minor updates
* updates
* add problem size option
* output updates
* update model outputs
* updates across the board
* update debug.sh
* update scripts and fix formatting/bugs
* fix bugs in drivers
* update driver formatting
* adding small time warning to output parser
* update problem sizes for dense la
* update problem sizes for sparse problems and fix bugs
* set problem sizes for graph problems and fix bugs
1 parent 140af99 commit 05af9f8

File tree: 575 files changed (+15913, -1031 lines)


analysis/bin-the-stack.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
""" Count the number of times different parallel programming models are used
in The Stack data set.
See https://huggingface.co/datasets/bigcode/the-stack for more info on
The Stack data set.
author: Daniel Nichols
date: December 2023
"""
# std imports
from argparse import ArgumentParser
from collections import Counter
import json
from multiprocessing import Pool
from typing import List

# tpl imports
from alive_progress import alive_it
from datasets import load_dataset
from numba import njit
from tqdm import tqdm


""" io helper functions """
def write_json(obj: dict, fpath: str):
    with open(fpath, 'w') as fp:
        json.dump(obj, fp)

def read_json(fpath: str):
    with open(fpath, 'r') as fp:
        return json.load(fp)

""" Helper functions for checking model type """
@njit
def any_in(s: str, substrs: List[str]) -> bool:
    for substr in substrs:
        if substr in s:
            return True
    return False

@njit
def uses_mpi(contents: str, language: str) -> bool:
    if language in ['C', 'C++']:
        return any_in(contents, ['MPI_', '#include <mpi.h>', '#include "mpi.h"'])
    elif language == 'FORTRAN':
        return any_in(contents, ['MPI_', 'include \'mpif.h\'', 'include "mpif.h"'])
    elif language == 'Python':
        return any_in(contents, ['mpi4py'])
    return False

@njit
def uses_omp(contents: str, language: str) -> bool:
    if language in ['C', 'C++']:
        return any_in(contents, ['#pragma omp', '#include <omp.h>'])
    elif language == 'FORTRAN':
        return any_in(contents, ['!$OMP PARALLEL ', 'USE OMP_LIB'])
    return False

@njit
def uses_kokkos(contents: str, language: str) -> bool:
    if language in ['C++']:
        return any_in(contents, ['#include <Kokkos_Core.hpp>', 'Kokkos::'])
    return False

@njit
def uses_cuda(contents: str, language: str) -> bool:
    return language == 'Cuda'

def count_row(row):
    contents, language = row
    if language in ['C', 'C++', 'FORTRAN', 'Python', 'Cuda']:
        mpi = uses_mpi(contents, language)
        omp = uses_omp(contents, language)
        kokkos = uses_kokkos(contents, language)
        cuda = uses_cuda(contents, language)
    else:
        mpi, omp, kokkos, cuda = False, False, False, False
    return mpi, omp, kokkos, cuda, language


#def count_models_parallel(batch, pool, chunksize=1000):
#    models = Counter()
#    languages = Counter()
#
#    results = pool.imap_unordered(count_row, zip(batch['content'], batch['lang']), chunksize=chunksize)
#    for mpi, omp, kokkos, cuda, language in results:
#        models["mpi"] += 1 if mpi else 0
#        models["omp"] += 1 if omp else 0
#        models["kokkos"] += 1 if kokkos else 0
#        models["cuda"] += 1 if cuda else 0
#        languages[language] += 1
#        models["total"] += 1
#        languages["total"] += 1
#
#    return models, languages

def count_models(batch):
    models = Counter()
    languages = Counter()

    for contents, language in zip(batch['content'], batch['lang']):
        if language in ['C', 'C++', 'FORTRAN', 'Python', 'Cuda']:
            models["mpi"] += 1 if uses_mpi(contents, language) else 0
            models["omp"] += 1 if uses_omp(contents, language) else 0
            models["kokkos"] += 1 if uses_kokkos(contents, language) else 0
            models["cuda"] += 1 if uses_cuda(contents, language) else 0
        languages[language] += 1
        models["total"] += 1
        languages["total"] += 1

    return models, languages

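# Example with a hypothetical two-row batch:
#   batch = {"content": ["#pragma omp parallel for", "print('hi')"], "lang": ["C++", "Python"]}
#   models, languages = count_models(batch)
#   -> models["omp"] == 1, models["total"] == 2
#   -> languages == Counter({"C++": 1, "Python": 1, "total": 2})
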
""" Parse Args """
parser = ArgumentParser(description=__doc__)
parser.add_argument("-p", "--num_processes", type=int, help="number of processes")
parser.add_argument("-c", "--chunk_size", type=int, help="chunk size for multiprocessing")
parser.add_argument("-b", "--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--skip", type=int, help="skip first n batches")
args = parser.parse_args()

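# Example invocation (flag values are illustrative):
#   python analysis/bin-the-stack.py -p 8 -c 16 -b 1000 --skip 150000
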
""" Create the data set """
print("Getting streaming data set...", flush=True)
dataset = load_dataset("bigcode/the-stack", streaming=True, split="train")

# running totals; when resuming with --skip, reload them from the checkpoint files
counts, language_counts = Counter(), Counter()
if args.skip:
    dataset = dataset.skip(args.skip * args.batch_size)
    counts.update(read_json(f'model-counts-{args.skip}.json'))
    language_counts.update(read_json(f'language-counts-{args.skip}.json'))
print("Got data set.", flush=True)

""" Count model usage """
print("Computing map...", flush=True)
THE_STACK_ROWS = 545_547_422
total_iter = THE_STACK_ROWS // args.batch_size + 1
if args.skip:
    total_iter -= args.skip

with Pool(processes=args.num_processes) as pool:
    chunksize = args.chunk_size if args.chunk_size else 1
    results = pool.imap(count_models, dataset.iter(batch_size=args.batch_size), chunksize=chunksize)

    #for idx, batch in alive_it(enumerate(batches), total=total_iter):
    for idx, (c, lc) in tqdm(enumerate(results), total=total_iter):
        #c, lc = count_models(batch)
        counts.update(c)
        language_counts.update(lc)

        # periodically checkpoint the counts so long runs can be resumed with --skip
        if idx != 0 and idx % 150_000 == 0:
            offset = args.skip if args.skip else 0
            write_json(counts, f'model-counts-{idx + offset}.json')
            write_json(language_counts, f'language-counts-{idx + offset}.json')

print("Final counts:")
print(counts)
print(language_counts, flush=True)

write_json(counts, 'model-counts.json')
write_json(language_counts, 'language-counts.json')
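
A note on the checkpointing above: the intermediate model-counts-*.json and language-counts-*.json files are plain dicts of counts, and resuming with --skip folds them back into live Counter objects. A minimal sketch of that merge, with invented checkpoint values:

from collections import Counter

checkpoint = {"mpi": 120, "omp": 340, "total": 150000}   # stand-in for model-counts-150000.json
counts = Counter()
counts.update(checkpoint)                # resume: fold checkpointed totals back in
counts.update({"mpi": 3, "total": 8})    # counts from one freshly processed batch
print(counts["mpi"], counts["total"])    # prints: 123 150008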

analysis/create-dataframe.py

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
""" Create a dataframe from the results of the run-all.py script.
"""
# std imports
import argparse
import json

# third party imports
import pandas as pd

def get_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_json", type=str, help="Input JSON file containing the test cases.")
    parser.add_argument("-o", "--output", type=str, required=True, help="Output csv file containing the results.")
    return parser.parse_args()

def has_outputs(prompt: dict) -> bool:
    """ Check if a prompt has a non-empty list of parsed output dicts """
    if "outputs" not in prompt:
        return False

    if not isinstance(prompt["outputs"], list) or len(prompt["outputs"]) == 0:
        return False

    # outputs that are still raw strings have not been evaluated by the drivers yet
    if all(isinstance(o, str) for o in prompt["outputs"]):
        return False

    return all(isinstance(o, dict) for o in prompt["outputs"])


def check(df: pd.DataFrame):
    # check for (name, parallelism_model) pairs that have zero successful builds
    agg = df.groupby(["name", "parallelism_model"]).agg({"did_build": "sum"})
    agg = agg[agg["did_build"] == 0]
    if len(agg) > 0:
        print("The following (name, parallelism_model) pairs have zero successful builds:")
        print(agg)

def main():
    args = get_args()

    # read in input
    with open(args.input_json, "r") as f:
        input_json = json.load(f)

    # filter out entries without outputs
    input_json = list(filter(has_outputs, input_json))

    # parse out rows; each run becomes a row
    rows = []
    for prompt in input_json:
        for output_idx, output in enumerate(prompt["outputs"]):
            # fields shared by every row for this output
            base_row = {
                "prompt": prompt["prompt"],
                "name": prompt["name"],
                "problem_type": prompt["problem_type"],
                "language": prompt["language"],
                "parallelism_model": prompt["parallelism_model"],
                "temperature": prompt["temperature"],
                "top_p": prompt["top_p"],
                "do_sample": prompt["do_sample"],
                "max_new_tokens": prompt["max_new_tokens"],
                "prompted": prompt["prompted"],
                "generated_output": output["generated_output"],
                "did_build": output["did_build"],
                "is_source_valid": output["is_source_valid"],
                "best_sequential_runtime": output["best_sequential_runtime"],
                "output_idx": output_idx
            }

            # outputs that never ran still get a row, just without run columns
            if output["runs"] is None:
                rows.append(base_row)
                continue

            for run_idx, run in enumerate(output["runs"]):
                rows.append({**base_row, "run_idx": run_idx, **run})

    # create dataframe
    df = pd.DataFrame(rows)

    # check for some possible data issues
    check(df)

    # write to csv
    df.prompt = df.prompt.apply(lambda x: x.replace("\n", "\\n"))
    df.generated_output = df.generated_output.apply(lambda x: x.replace("\n", "\\n"))
    df.to_csv(args.output, index=False)

if __name__ == "__main__":
    main()
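
For reference, a minimal sketch of the input JSON this script expects; the key names come from the code above, but the values and the per-run keys merged in via **run are invented for illustration:

example_input = [{
    "prompt": "...", "name": "gemm", "problem_type": "dense_la",
    "language": "cpp", "parallelism_model": "omp",
    "temperature": 0.2, "top_p": 0.95, "do_sample": True,
    "max_new_tokens": 1024, "prompted": True,
    "outputs": [{
        "generated_output": "...",
        "did_build": True, "is_source_valid": True,
        "best_sequential_runtime": 1.23,
        "runs": [{"runtime": 0.45}],  # each run dict's keys become extra dataframe columns
    }],
}]

A typical invocation would then be something like: python analysis/create-dataframe.py results.json -o results.csv (file names illustrative).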
