import pandas as pd
import numpy as np

import metrics as mt

# metrics I'm not inclined to include in the paper
#              'DecT':get_declin_tilde,
#              'BDecT':get_bdec_tilde,

# All of the metrics we're considering in this paper, keyed by the column
# name used in the metrics dataframe.
# Note that Lop and EVW return a sequence (value plus corresponding
# statistical data) rather than a single number; see populate_metrics for
# how those extra entries are unpacked into their own columns.
metric_dict = {
    'Dec': mt.get_declin,
    'BDec': mt.get_bdec,
    'EG': mt.get_EG,
    'LG': mt.get_EG_loss_only,
    'DG': mt.get_EG_difference,
    'SG': mt.get_EG_surplus_only,
    'VC1': mt.get_EG_vote_centric_one,
    'VC2': mt.get_EG_vote_centric_two,
    'Bias': mt.get_bias,
    'MM': mt.get_mean_median,
    'Lop': mt.get_lop_list,
    'EVW': mt.get_evw,
}

# exclude evw
m_noevw = dict(metric_dict)
del m_noevw['EVW']

# exclude evw and sg
m_noevw_nosg = dict(m_noevw)
del m_noevw_nosg['SG']

# Every metric except EVW.  Derived from metric_dict (instead of being a
# hand-maintained copy) so it is explicit which metric is left out and the
# two tables cannot drift apart.
m_list = dict((key, func) for key, func in metric_dict.items()
              if key != 'EVW')

rescale_suffix = '-R'
absolute_value_suffix = '-abs'
rms_cols = [x + rescale_suffix for x in ['EG', 'BDec', 'MM', 'Bias']]

#####################################################################
# general code for working with the metrics dataframes
#####################################################################


def _count_dem_seats(demfrac):
    """ number of entries of demfrac strictly greater than 0.5 """
    # Counting with a generator works on both Python 2 and Python 3;
    # len(filter(...)) fails on Python 3 because filter returns an iterator.
    return sum(1 for frac in demfrac if frac > 0.5)


def make_df(elecs):
    """ make a pandas dataframe from elections for holding metric values
    populate with basic data about the election

    elecs -- mapping from election key (a string) to an election object
             exposing ``Ndists`` and ``demfrac``.  The key is sliced as
             key[:4] -> 'year', key[5:7] -> 'state', and key[-2:] == '11'
             -> 'house'.

    Returns a new dataframe with one row per election that has N > 0 and
    columns elecs, year, state, house, N, Davg, SeatDiff, Sfrac, comp, nosw.
    """
    df = pd.DataFrame({'elecs': list(elecs.keys())})

    # Attribute access is okay here since df.elecs returns a series -
    # only one axis to work with.
    df['year'] = df.elecs.apply(lambda key: key[:4])
    df['state'] = df.elecs.apply(lambda key: key[5:7])
    df['house'] = df.elecs.apply(lambda key: key[-2:] == '11')
    df['N'] = df.elecs.apply(lambda key: elecs[key].Ndists)

    # There's a MN election in database with no data - ignore it.
    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than a slice (avoids SettingWithCopyWarning).
    df = df[df['N'] > 0].copy()

    df['Davg'] = df.elecs.apply(lambda key: np.mean(elecs[key].demfrac))
    # seats won (demfrac > 0.5) minus half of the seats
    df['SeatDiff'] = df.elecs.apply(
        lambda key: _count_dem_seats(elecs[key].demfrac)
        - elecs[key].Ndists * 1.0 / 2)
    # fraction of seats with demfrac > 0.5
    df['Sfrac'] = df.elecs.apply(
        lambda key: _count_dem_seats(elecs[key].demfrac) * 1.0
        / elecs[key].Ndists)
    # is the election competitive?
    df['comp'] = df.elecs.apply(
        lambda key: 0.45 <= np.mean(elecs[key].demfrac) <= 0.55)
    # did each party win at least one seat?
    df['nosw'] = df.elecs.apply(
        lambda key: 0 < _count_dem_seats(elecs[key].demfrac)
        < elecs[key].Ndists)
    return df


def populate_metrics(df, mdict, elecs):
    """ populate dataframe with metric values

    df    -- dataframe with an 'elecs' column of election keys
    mdict -- mapping from column name to a function of an election's demfrac
    elecs -- mapping from election key to election object (uses .demfrac)

    For every key in mdict a column of that name is added, followed by a
    column named key + absolute_value_suffix holding its absolute value.
    If 'Lop' is requested its result is unpacked into Lop, Lop-t, Lop-p and
    Lop-sig; if 'EVW' is requested its result is unpacked into EVW and
    EVW-sig.  Metrics not present in mdict are skipped, so the reduced
    tables (e.g. m_noevw) can be passed in.

    Modifies df in place and returns it.
    """
    for fkey in mdict.keys():
        metric_func = mdict[fkey]
        df[fkey] = df.elecs.apply(
            lambda key, func=metric_func: func(elecs[key].demfrac))

    # make adjustments for columns corresponding to "tests" (rather than measures)
    if 'Lop' in mdict:
        lop_raw = df['Lop']
        df['Lop-t'] = lop_raw.apply(lambda res: res[1])    # value of t-statistic
        df['Lop-p'] = lop_raw.apply(lambda res: res[2])    # p-value
        # whether statistically significant
        # NOTE(review): the threshold is 0.5 applied to entry 3 of the Lop
        # result; confirm against mt.get_lop_list that this is intended.
        df['Lop-sig'] = lop_raw.apply(lambda res: res[3] < 0.5)
        # difference between winning vote shares (overwrites the raw result)
        df['Lop'] = lop_raw.apply(lambda res: res[0])

    # make adjustments for EVW
    if 'EVW' in mdict:
        evw_raw = df['EVW']
        df['EVW-sig'] = evw_raw.apply(lambda res: res[1])
        df['EVW'] = evw_raw.apply(lambda res: res[0])

    # now that we've fixed things up, every metric column holds a scalar
    for fkey in mdict.keys():
        df[fkey + absolute_value_suffix] = df[fkey].apply(abs)

    return df


def rescale_metrics(df, mdict):
    """ rescale all metrics so standard deviation is 1.

    For each name in mdict, adds a column name + rescale_suffix equal to the
    original column divided by its standard deviation.  The mean is not
    shifted.  Returns a modified copy; the input dataframe is left untouched.
    """
    # Work on a copy to get rid of a SettingWithCopyWarning
    rescaled = df.copy()
    for name in mdict:
        rescaled.loc[:, name + rescale_suffix] = (
            rescaled[name] / rescaled[name].std())
    return rescaled


def get_sigma(df, mdict):
    """ return a dictionary containing standard deviations for requested columns

    mdict may be a dict keyed by column name or any iterable of column names.
    """
    return dict((name, df[name].std()) for name in mdict)


def make_pairwise(df, measures):
    """ make pairwise comparisons for the specified measures

    For every unordered pair (m1, m2), with m1 appearing before m2 in
    measures, adds to df (in place) a column 'm1-m2' + absolute_value_suffix
    holding the absolute difference of the two rescaled columns
    (m + rescale_suffix).  Returns the list of 'm1-m2' labels in the order
    the columns were created.
    """
    labels = []
    for i, m1 in enumerate(measures):
        for j, m2 in enumerate(measures):
            if j <= i:
                continue
            label = m1 + '-' + m2
            df.loc[:, label + absolute_value_suffix] = abs(
                df.loc[:, m1 + rescale_suffix] - df.loc[:, m2 + rescale_suffix])
            labels.append(label)
    return labels


def get_rms(row):
    """ spread of the rms_cols entries of a row around row['Avg']

    Note: this is the square root of the *sum* of squared deviations; it is
    not divided by the number of columns.
    """
    return np.sqrt(sum((row[col] - row['Avg']) ** 2 for col in rms_cols))


def find_least_consensus(df, n=8):
    """ add a new column with rms error among given measures

    Adds 'Avg' (row-wise mean of rms_cols) and 'RMS' (see get_rms) to df in
    place, then returns a new dataframe with the 'elecs' and 'RMS' columns of
    the n rows having the largest RMS, re-indexed from 0.
    """
    df['Avg'] = df[rms_cols].apply(np.mean, axis='columns')
    df['RMS'] = df.apply(get_rms, axis='columns')
    ranked = df[['elecs', 'RMS']].sort_values('RMS')
    # nlargest determines which rows are kept and their final ordering
    return ranked.nlargest(n, 'RMS').reset_index(drop=True)