Initial commit

ufkhan97 · Mar 21, 2024 · 64daf24 · 64daf24
commit 64daf24
Show file tree

Hide file tree

Showing 6 changed files with 25,756 additions and 0 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/Zuzalu_Events_votes.csv b/Zuzalu_Events_votes.csv
diff --git a/__pycache__/fundingutils.cpython-310.pyc b/__pycache__/fundingutils.cpython-310.pyc
diff --git a/fundingutils.py b/fundingutils.py
@@ -0,0 +1,375 @@
+import pandas as pd
+from itertools import combinations
+from math import log
+from math import sqrt
+from math import floor
+import numpy as np
+import matplotlib.pyplot as plt
+from functools import reduce
+import time
+import json
+import os
+
+def add(x,y):
+  """Return x + y."""
+  total = x + y
+  return total
+
+#
+#
+# definitions for a *bunch* of variations of QF
+#
+# in all of these functions,
+# - donation_df is expected to be a pandas dataframe where rows are wallets, columns are projects, and entries represent a wallet's total donation amount to a project
+# - cluster_df is expected to be a pandas dataframe where rows are wallets, columns are clusters, and entries denote the strength of a user's membership in that cluster.
+#
+# also important to note: these functions all return the matching amounts each project should get under that variant of QF -- to get the full funding amount,
+# you need to add in the direct donations as well!
+#
+
+
+# first, some helper functions
+def binarize(df):
+  """Return a 0/1 frame with the same shape and labels as df.
+
+  An entry becomes 1 when the original value is strictly positive and 0
+  otherwise (zero, negative and NaN entries all map to 0).
+
+  The comparison is vectorized rather than going through DataFrame.applymap,
+  which is deprecated in recent pandas releases and calls a Python function
+  once per cell.
+  """
+  return (df > 0).astype(int)
+
+def align(donation_df, cluster_df):
+  """Restrict both frames to the wallets they share and sort them identically.
+
+  Parameters:
+    donation_df: frame indexed by wallet (columns are projects).
+    cluster_df: frame indexed by wallet (columns are clusters).
+
+  Returns:
+    (donation_df, cluster_df) as new frames that
+    - contain no wallet whose row is entirely zero,
+    - contain only wallets present in both frames, and
+    - are sorted by index, so that row i refers to the same wallet in both
+      (important for the matrix multiplications done by the callers).
+
+  The inputs are not modified; callers must use the returned frames.
+  """
+  # first, drop users who haven't made any donations / aren't in any clusters
+  active_donations = donation_df[(donation_df != 0).any(axis=1)]
+  active_clusters = cluster_df[(cluster_df != 0).any(axis=1)]
+
+  # also remove wallets that are just in one dataframe, but not the other
+  shared_donations = active_donations[active_donations.index.isin(active_clusters.index)]
+  shared_clusters = active_clusters[active_clusters.index.isin(active_donations.index)]
+
+  # make sure the indices are sorted the same way
+  return shared_donations.sort_index(), shared_clusters.sort_index()
+
+def check_matching_cap(col, matching_cap_percent):
+    """Clamp every entry of col to matching_cap_percent and hand the excess to the rest.
+
+    Works on a copy of col. On each pass the amount above the cap is measured,
+    over-cap entries are set to the cap, and the entries strictly below the cap
+    are scaled up proportionally so that together they absorb the excess. The
+    pass repeats while the scaling pushes new entries over the cap, and stops
+    early when the entries below the cap sum to nothing (nowhere to put the excess).
+
+    col and matching_cap_percent must be on the same scale.
+    Returns the adjusted copy.
+    """
+    shares = col.copy()
+
+    def excess_over_cap(values):
+        # total amount by which the entries exceed the cap
+        return np.maximum(0, values - matching_cap_percent).sum()
+
+    while True:
+        surplus = excess_over_cap(shares)
+        shares.loc[shares > matching_cap_percent] = matching_cap_percent
+
+        below_cap = shares < matching_cap_percent
+        redistributable_base = shares[below_cap].sum()
+        if not redistributable_base > 0:
+            # nothing below the cap can absorb the surplus
+            break
+
+        shares.loc[below_cap] *= (1 + surplus / redistributable_base)
+
+        if not excess_over_cap(shares) > 0:
+            # the redistribution did not push anything over the cap
+            break
+
+    return shares
+
+def scale_matching(funding, matching_cap_percent, matching_amount):
+    """Turn raw per-project funding into capped payouts from a matching pool.
+
+    Parameters:
+      funding: dict mapping project name -> raw matching amount.
+      matching_cap_percent: largest share of the pool a single project may
+        receive. It is compared directly against the normalized shares, so it
+        is expressed as a fraction (0-1 scale), not as a percentage.
+      matching_amount: total size of the matching pool.
+
+    Returns:
+      DataFrame with columns 'project_name' and 'matching_amount'.
+
+    When the raw funding sums to zero there is nothing to apportion, so every
+    project gets a share of 0 instead of triggering a division by zero.
+    """
+    total_money = sum(funding.values())
+    if total_money != 0:
+        funding_normalized = {p: amount / total_money for p, amount in funding.items()}
+    else:
+        funding_normalized = {p: 0.0 for p in funding}
+    # Create DataFrame with 'project_name' and 'matching_amount' columns
+    result = pd.DataFrame(list(funding_normalized.items()), columns=['project_name', 'matching_amount'])
+    # Apply the cap to the 'matching_amount' column
+    result['matching_amount'] = check_matching_cap(result['matching_amount'], matching_cap_percent)
+    # Scale the 'matching_amount' column by the total matching amount
+    result['matching_amount'] = result['matching_amount'] * matching_amount
+    return result
+
+# now on to the QF variants
+
+def standard_qf(donation_df):
+  """Standard quadratic funding match for every project (column) of donation_df.
+
+  For each project the match is (sum of the square roots of the donations)
+  squared, minus the plain sum of the donations.
+
+  Returns a dict mapping project -> matching amount.
+  """
+  funding = {}
+  for project in donation_df.columns:
+    contributions = donation_df[project]
+    sum_of_roots = contributions.map(sqrt).sum()
+    funding[project] = (sum_of_roots ** 2) - contributions.sum()
+
+  return funding
+
+def pairwise(donation_df, M=0.01):
+  """Pairwise-bounded quadratic funding match for every project.
+
+  Parameters:
+    donation_df: rows are wallets, columns are projects, entries are donation amounts.
+    M: constant used in the pairwise coefficient k_ij = M / (M + <sqrt(c_i), sqrt(c_j)>).
+
+  Returns a dict mapping project -> matching amount (direct donations are not included).
+  """
+  project_names = donation_df.columns
+  wallets = donation_df.index
+
+  roots = donation_df.apply(np.sqrt)
+
+  # overlap[i, j] is the dot product of wallet i's and wallet j's square-rooted
+  # donation vectors; the scalar M broadcasts over the whole matrix, so
+  # k_matrix[i, j] is the coefficient that damps the pair (i, j).
+  overlap = roots.dot(roots.transpose())
+  k_matrix = M / (M + overlap)
+
+  # the projects each wallet actually gave to
+  supported = {}
+  for wallet in wallets:
+    supported[wallet] = {p for p in project_names if donation_df.loc[wallet, p] > 0}
+
+  # matching starts from zero; only the pairwise terms are accumulated
+  funding = dict.fromkeys(project_names, 0)
+
+  for first, second in combinations(wallets, 2):
+    shared = supported[first] & supported[second]
+    if not shared:
+      continue
+    k = k_matrix.loc[first, second]
+    for p in shared:
+      funding[p] += roots.loc[first, p] * roots.loc[second, p] * k
+
+  return funding
+
+def cluster_profile_pairwise(donation_df, cluster_df):
+  """Pairwise matching where each pair of donors is discounted by how much their cluster memberships overlap.
+
+  cluster_df is binarized first, then both frames are passed through align().
+  For two donors i and j the coefficient is
+
+      (# clusters exactly one of i, j is in) / (# clusters i is in + # clusters j is in)
+
+  and a project's match is the sum of sqrt(c_i) * sqrt(c_j) * coefficient(i, j)
+  over every ordered pair of its donors. The i == j terms contribute nothing,
+  because a donor's coefficient with itself has a zero numerator.
+
+  Returns a dict mapping project -> matching amount (direct donations are not included).
+  """
+
+  cluster_df = binarize(cluster_df)
+
+  donation_df, cluster_df = align(donation_df, cluster_df)
+
+  projects = donation_df.columns
+  donors = donation_df.index
+  clusters = cluster_df.columns
+  # NOTE: cluster_members is not used below
+  cluster_members = cluster_df.index
+
+  # matching starts from zero: the direct donations are deliberately left out
+  # (the alternative that seeded each project with its donation total is disabled)
+  funding = {p : 0 for p in projects}
+
+
+  # the pairwise matching coefficient for agents i and j is:
+  # (# groups just i is in + # groups just j is in) / (# groups i is in + # groups j is in)
+
+  # first, make a matrix whose entries are the numerators of the above formula for every pair of agents
+  # we make it by first setting each entry to be the total number of clusters, then subtracting the clusters that both i and j are in,
+  # then subtracting the clusters that neither i nor j are in. We're left with the clusters that exactly one of i or j are in.
+  numerator_matrix = pd.DataFrame(index=donors, columns=donors, data=len(clusters)) - cluster_df.dot(cluster_df.transpose()) - ((1-cluster_df).dot(1-cluster_df.transpose()))
+
+
+  # now we make a matrix representing the denominators of the above formula
+  # A is a vector where entry i is the number of groups i is in
+  A = cluster_df.apply(sum, axis=1)
+  # B is a square donors-by-donors matrix built by repeating A once per donor
+  B = pd.DataFrame(index=donors,columns=donors,data=[A]*len(donors))
+  # by adding B and its transpose, we get a symmetric matrix where entry (i,j) is the number of groups i is in + the number of groups j is in
+  # (because of the symmetry it does not matter whether A was laid out along the rows or the columns of B)
+  denominator_matrix = B + B.transpose()
+  # finally, we can get the coefficient matrix by dividing the numerators by the denominators
+  coeffs = numerator_matrix / denominator_matrix
+
+
+  for p in projects:
+
+    # wallets that gave nothing to this project are removed from both the donations and the coefficients
+    non_donors = donation_df[donation_df[p] == 0].index
+
+    donor_only_donation_df = donation_df.drop(non_donors, axis=0)
+
+    donor_only_coeffs = coeffs.drop(non_donors, axis=1).drop(non_donors, axis=0)
+
+    # y holds sqrt(donation) for each remaining donor; z is the same data as a one-column frame
+    y = donor_only_donation_df[p].apply(sqrt)
+    z = pd.DataFrame(y)
+    # QF_matrix[i, j] = sqrt(c_i) * sqrt(c_j)
+    QF_matrix = z.dot(z.transpose())
+    # weight every pair by its coefficient and add up the whole matrix
+    funding[p] += (QF_matrix * donor_only_coeffs).sum().sum()
+
+  return funding
+
+def clustermatch(donation_df, cluster_df):
+  """Cluster-match QF: clusters, rather than individual wallets, go under the square root.
+
+  Parameters:
+    donation_df: rows are wallets, columns are projects, entries are donation amounts.
+    cluster_df: rows are wallets, columns are clusters, entries are membership weights.
+
+  Returns:
+    dict mapping project -> matching amount (direct donations are not included).
+
+  Each wallet's membership weights are normalized to sum to 1, so a wallet's
+  donation is split across its clusters. Wallets missing from either frame are
+  ignored. Neither input frame is modified.
+  """
+  projects = donation_df.columns
+
+  # Normalize every row to sum to 1. Rows with no membership at all are divided
+  # by 1 instead of by their (zero) sum, so they stay all-zero rather than
+  # turning into NaN or breaking the row-wise computation.
+  has_membership = (cluster_df != 0).any(axis=1).to_numpy()
+  row_totals = cluster_df.sum(axis=1).to_numpy(dtype=float)
+  normalized_clusters = cluster_df.div(np.where(has_membership, row_totals, 1.0), axis=0)
+
+  # keep only wallets present in both frames, sorted the same way, working on
+  # new frames so the caller's data is left untouched
+  shared_donations = donation_df[donation_df.index.isin(cluster_df.index)].sort_index()
+  normalized_clusters = normalized_clusters[normalized_clusters.index.isin(donation_df.index)].sort_index()
+
+  # B is a matrix where rows are projects, columns are clusters, and entry (i,j) is cluster j's donation to project i
+  B = shared_donations.transpose().dot(normalized_clusters)
+
+  funding = {}
+  for p in projects:
+    cluster_totals = B.loc[p]
+    funding[p] = cluster_totals.map(sqrt).sum() ** 2 - cluster_totals.sum()
+  return funding
+
+def donation_profile_clustermatch(donation_df):
+  """Run cluster match using donation profiles as the clusters.
+
+  Everyone who donated to exactly the same set of projects is put under the
+  same square root.
+
+  donation_df is expected to be a pandas DataFrame where rows are unique donors,
+  columns are projects, and entry i,j is donor i's total donation to project j.
+
+  Profiles are encoded as binary strings in column order: with four projects,
+  a donor who gave to projects 0, 1 and 3 belongs to cluster "1101".
+
+  Returns the dict produced by clustermatch().
+  """
+  project_order = donation_df.columns
+
+  def profile_of(row):
+    # one character per project: '1' if the donor gave to it, '0' otherwise
+    bits = ['1' if row[p] > 0 else '0' for p in project_order]
+    return ''.join(bits)
+
+  don_profiles = donation_df.apply(profile_of, axis=1)
+
+  # one column per distinct profile; each wallet gets a 1 in its own profile's column
+  membership = pd.DataFrame(data=0, index=donation_df.index, columns=don_profiles.unique())
+  for wallet, profile in don_profiles.items():
+    membership.loc[wallet, profile] = 1
+
+  return clustermatch(donation_df, membership)
+
+def COCM(donation_df, cluster_df, fancy=True):
+  """Run CO-CM on a set of donations and cluster memberships.
+
+  Parameters:
+    donation_df: rows are wallets, columns are projects, entries are donation amounts.
+    cluster_df: rows are wallets, columns are clusters, entries are membership strengths.
+    fancy: if False, follow the formula in the whitepaper exactly (a wallet either
+      is or is not connected to a cluster). If True, a wallet that is not in a
+      cluster is treated as partially connected to it, in proportion to the
+      fraction of its friends who are in that cluster.
+
+  Returns:
+    dict mapping project -> matching amount (direct donations are not included).
+  """
+
+  # align() drops wallets with no donations / no clusters, keeps only wallets
+  # present in both frames, and sorts both the same way
+  # (important for making sure the matrix multiplications work later)
+  donation_df, cluster_df = align(donation_df, cluster_df)
+
+  projects = donation_df.columns
+  clusters = cluster_df.columns
+  donors = donation_df.index
+  # NOTE: cluster_members is not used below
+  cluster_members = cluster_df.index
+
+
+  # normalize the cluster dataframe so that rows sum to 1. Now, an entry tells us the "weight" that a particular cluster has for a particular user.
+  # if a user is in 0 clusters, their row would be a bunch of NaNs if we naively divided by the row sum.
+  # we shouldn't have any such users after align(), but just in case, the lambda returns 0 for such a row instead
+  # NOTE(review): returning a scalar for some rows and a Series for others may not produce a rectangular frame -- confirm if that case can ever occur
+  normalized_clusters = cluster_df.apply(lambda row: row / row.sum() if any(row) else 0, axis=1)
+
+  binarized_clusters = binarize(cluster_df)
+
+  if fancy:
+    # friendship_matrix is a matrix whose rows and columns are both wallets,
+    # and a value of True at index i,j means that wallets i and j are in at least one cluster together.
+    friendship_matrix = cluster_df.dot(cluster_df.transpose()).apply(lambda col: col > 0)
+
+    # k_indicators is a dataframe with wallets as rows and clusters as columns.
+    # if wallet i is not in cluster g, then entry i,g is the fraction of i's friends who are in cluster g (i's friends are the agents i is in a shared cluster with).
+    # if wallet i is in cluster g, then entry i,g is 1.
+
+    # first: (number of i's friends in cluster g) / (number of i's friends)
+    # in the past, we used cluster_df in the following line instead of binarized_clusters
+    k_indicators = friendship_matrix.dot(binarized_clusters).apply(lambda row: row / friendship_matrix.loc[row.name].sum(), axis=1)
+    # then: raise the entry to 1 wherever wallet i is itself a member of cluster g
+    # ... and the following line used cluster_df instead of binarized_clusters
+    k_indicators = k_indicators.apply(lambda row: np.maximum(row, binarized_clusters.loc[row.name]), axis=1)
+
+  else:
+
+    # friendship_matrix is a matrix whose rows and columns are both wallets,
+    # and a value greater than 0 at index i,j means that wallets i and j are in at least one group together.
+    friendship_matrix = cluster_df.dot(cluster_df.transpose())
+
+    # k_indicators is a dataframe with wallets as rows and clusters as columns.
+    # entry i,g is True if wallet i is in a shared group with anyone from g, and False otherwise.
+    k_indicators = friendship_matrix.dot(cluster_df).apply(lambda col: col > 0)
+
+  # Create a dictionary to store the matching amount for each project.
+  # Matching starts from zero: the direct donations are deliberately left out
+  # (the alternative that seeded each project with its donation total is disabled),
+  # and only the pairwise subsidies computed below are added.
+  funding = {p: 0 for p in projects}
+
+  for p in projects:
+    # get the actual k values for this project using contributions and indicators.
+
+    # C will be used to build the matrix of k values.
+    # It is a matrix where rows are wallets, columns are clusters, and the ith row of the matrix just has wallet i's contribution to the project in every entry.
+    C = pd.DataFrame(index=donors, columns = ['_'], data = donation_df[p].values).dot(pd.DataFrame(index= ['_'], columns = clusters, data=1))
+    # C is attained by taking the matrix multiplication of the column vector donation_df[p] (which is every agent's donation to project p) and a row vector with as many columns as clusters, and a 1 in every entry
+    # the above line is so long mainly because you need to cast Pandas series' (i.e. vectors) as dataframes (i.e. matrices) for the matrix multiplication to work.
+
+    # now, K is a matrix where rows are wallets, columns are clusters, and entry i,g ranges between c_i and sqrt(c_i) depending on i's relationship with cluster g and whether "fancy" was set to true or not:
+    # an indicator of 1 gives sqrt(c_i), an indicator of 0 gives c_i, and values in between interpolate linearly.
+    K = (k_indicators * C.pow(1/2)) + ((1 - k_indicators) * C)
+
+
+    # Now we have all the k values, which are one of the items inside the innermost sum expressed in COCM.
+    # the other component of these sums is the weight of each cluster for that user, taken from normalized_clusters.
+    # P_prime is a clusters-by-clusters matrix that combines the two: the entry for row cluster h and column cluster g is
+    #
+    #       sum_i K(i,h) * normalized_clusters(i,g)
+    #
+    # (presumably equal to sum_{i in g} K(i,h) / T_i, with T_i the number of groups i is in, when memberships are 0/1 -- confirm)
+    P_prime = K.transpose().dot(normalized_clusters)
+
+    # Now, we can create P, whose non-diagonal entries g,h represent the pairwise subsidy given to the pair of groups g and h.
+    # It is the element-wise geometric mean of P_prime and its transpose, so it is symmetric.
+    P = (P_prime * P_prime.transpose()).pow(1/2)
+
+    # The diagonal entries of P are not relevant, so get rid of them. We only care about the pairwise subsidies between distinct groups.
+    # NOTE(review): this writes through P.values and so relies on it exposing P's own buffer rather than a copy -- confirm for the pandas version in use
+    np.fill_diagonal(P.values, 0)
+
+    # Now the sum of every entry in P is the amount of subsidy funding COCM awards to the project.
+    funding[p] += P.sum().sum()
+
+
+  return funding
+
+def standard_donation(donation_df):
+  """Plain (non-quadratic) tally: each project's funding is the sum of its donations.
+
+  Returns a dict mapping project -> total donated.
+  """
+  totals = {}
+  for project in donation_df.columns:
+    totals[project] = donation_df[project].sum()
+  return totals
+
+def apply_sliding_scale(votes_data, score_range=(15, 25), scale_range=(0.5, 1.0)):
+    """Scale each vote's 'amountUSD' by a factor that depends on its 'score'.
+
+    Parameters:
+      votes_data: DataFrame with 'amountUSD' and 'score' columns. It is modified
+        in place: 'amountUSD' is overwritten and a 'starting_amountUSD' column
+        holding the unscaled amounts is added if it is not already there.
+      score_range: increasing score breakpoints. Defaults to (15, 25).
+      scale_range: scaling factor at each breakpoint. Defaults to (0.5, 1.0).
+
+    Returns:
+      The same DataFrame, for convenience.
+
+    Scores below the first breakpoint get an amount of 0. Scores inside the
+    range get a linearly interpolated factor. Scores at or above the last
+    breakpoint get the last factor (the full amount with the defaults).
+    """
+    score_points = np.asarray(score_range, dtype=float)
+    scale_points = np.asarray(scale_range, dtype=float)
+
+    # Remember the unscaled amounts the first time through, and always scale
+    # from them, so calling this function again does not compound the scaling.
+    if 'starting_amountUSD' not in votes_data.columns:
+        votes_data['starting_amountUSD'] = votes_data['amountUSD']
+
+    # np.interp clamps outside the breakpoints, so scores at or above the last
+    # breakpoint already receive the last scaling factor.
+    factor = np.interp(votes_data['score'], score_points, scale_points)
+    votes_data['amountUSD'] = votes_data['starting_amountUSD'] * factor
+
+    # Below the first breakpoint the vote does not count at all
+    votes_data.loc[votes_data['score'] < score_points[0], 'amountUSD'] = 0
+
+    return votes_data
+
+def flag_base_votes(votes, min_donation_threshold, score_threshold):
+    """Add 0/1 flag columns to votes and return it.
+
+    Columns written (the frame is modified in place):
+      self_vote  - 1 when 'voter' equals 'payoutAddress'
+      low_score  - 1 when 'score' is below score_threshold
+      low_amount - 1 when 'amountUSD' is below min_donation_threshold
+      base_vote  - 1 only when none of the three flags above is set
+    """
+    is_self_vote = votes['voter'] == votes['payoutAddress']
+    is_low_score = votes['score'] < score_threshold
+    is_low_amount = votes['amountUSD'] < min_donation_threshold
+
+    votes['self_vote'] = is_self_vote.astype(int)
+    votes['low_score'] = is_low_score.astype(int)
+    votes['low_amount'] = is_low_amount.astype(int)
+
+    # a base vote is one that trips none of the flags
+    flagged = is_self_vote | is_low_score | is_low_amount
+    votes['base_vote'] = (~flagged).to_numpy().astype(int)
+
+    return votes
+
+def prep_donations_data(votes_data, min_donation_threshold, score_threshold):
+  """Flag the votes with flag_base_votes(), then rescale their amounts with apply_sliding_scale().
+
+  Returns the frame returned by apply_sliding_scale().
+  """
+  flagged = flag_base_votes(votes_data, min_donation_threshold, score_threshold)
+  return apply_sliding_scale(flagged)
+
+def pivot_votes(round_votes):
+    """Pivot a table of individual votes into a voter-by-project donation matrix.
+
+    Parameters:
+      round_votes: DataFrame with 'voter', 'project_name' and 'amountUSD' columns.
+
+    Returns:
+      DataFrame indexed by voter, with one column per project, where each entry
+      is the voter's total donation to that project (0 when there is none).
+
+    aggfunc is set to 'sum' explicitly: pivot_table averages by default, which
+    would understate the contribution of a voter who gave to the same project
+    more than once.
+    """
+    return round_votes.pivot_table(index='voter', columns='project_name', values='amountUSD', aggfunc='sum', fill_value=0)
+
+def get_qf_matching(algo, donation_df, matching_cap_percent, matching_amount, cluster_df = None):
+    """Compute every project's payout from the matching pool under the chosen QF variant.
+
+    Parameters:
+      algo: 'donation_profile_clustermatch', 'COCM' or 'pairwise'; any other
+        value falls back to standard QF.
+      donation_df: rows are wallets, columns are projects, entries are donation amounts.
+      matching_cap_percent: largest percentage (0-100) of the pool a single
+        project may receive. Values of 100 or more disable the cap.
+      matching_amount: total size of the matching pool.
+      cluster_df: rows are wallets, columns are clusters. Required for 'COCM'.
+
+    Returns:
+      DataFrame with columns 'project_name', 'matching_amount' (payout from the
+      pool) and 'matching_percent' (share of the pool, in percent).
+
+    Raises:
+      ValueError: if algo is 'COCM' and no cluster_df was supplied.
+    """
+    projects = donation_df.columns
+    if algo == 'donation_profile_clustermatch':
+        funding = donation_profile_clustermatch(donation_df)
+    elif algo == 'COCM':
+        if cluster_df is None:
+            raise ValueError("cluster_df is required when algo is 'COCM'")
+        funding = COCM(donation_df, cluster_df)
+    elif algo == 'pairwise':
+        funding = pairwise(donation_df)
+    else:
+        funding = standard_qf(donation_df)
+    total_money = sum(funding.values())
+    if total_money != 0:
+        funding_normalized = {p: funding[p]/total_money for p in projects}
+    else:
+        # no project earned any match, so there is nothing to apportion;
+        # give everyone a zero share rather than dividing by zero
+        funding_normalized = {p: 0.0 for p in projects}
+    # Create DataFrame with 'project_name' and 'matching_amount' columns
+    result = pd.DataFrame(list(funding_normalized.items()), columns=['project_name', 'matching_amount'])
+    # Apply the cap to the 'matching_amount' column (shares are fractions here, hence the /100)
+    if matching_cap_percent < 100.0:
+        result['matching_amount'] = check_matching_cap(result['matching_amount'], matching_cap_percent/100)
+    # Report the share as a percentage, then scale it by the total matching amount
+    result['matching_percent'] = result['matching_amount'] * 100
+    result['matching_amount'] = result['matching_amount'] * matching_amount
+    return result