Skip to main content

A collection of our functions and classes from bootcamp.

Project description


  • A collection of tools created for bootcamp.
  • More information to be added later.

Table of Contents

    # Module-level identifier string for this collection of helpers.
    name = "JMI_MVM"
    # Help text listing the function names suggested as a starting point.
    help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import seaborn as sns
    def calc_roc_auc(X_test,y_test,dtc,verbose=False):
        """Score an already-fit classifier by the area under its ROC curve.

        Args:
            X_test: features handed to ``dtc.predict``.
            y_test: true labels the predictions are compared against.
            dtc: fitted classifier exposing ``predict``.
            verbose: when True, also print the result.

        Returns:
            The ROC AUC expressed as a percentage, rounded to 3 decimals.
        """
        predictions = dtc.predict(X_test)
        # roc_curve yields (false-positive rate, true-positive rate, thresholds);
        # the thresholds are not needed for the area.
        false_pos, true_pos, _ = roc_curve(y_test, predictions)
        roc_auc_perc = round(auc(false_pos, true_pos) * 100, 3)
        if verbose:
            print(f"roc_curve's auc = {roc_auc_perc}%")
        return roc_auc_perc
    def tune_params(param_name, param_values):
        """Fit one decision tree per candidate value and compare train/test ROC AUC.

        Relies on names defined outside this function: ``X_train``, ``y_train``,
        ``X_test``, ``y_test``, ``DecisionTreeClassifier`` and ``calc_roc_auc``.

        Args:
            param_name: name of the DecisionTreeClassifier parameter to tune.
            param_values: iterable of values to try for that parameter.

        Returns:
            (df_results, df_style): a DataFrame with one row per tried value
            (columns: param_name, 'train_roc_auc', 'test_roc_auc') and a pandas
            Styler of the same data with a green colour gradient on the scores.
        """
        import seaborn as sns

        auc_columns = ['train_roc_auc', 'test_roc_auc']
        res_list = [[param_name] + auc_columns]

        for value in param_values:
            # Create model and apply the value under test.
            # NOTE(review): set_params is the presumed "set params" step; the
            # source of this line was lost - confirm against the upstream package.
            dtc_temp = DecisionTreeClassifier(criterion='entropy')
            dtc_temp.set_params(**{param_name: value})
            dtc_temp.fit(X_train, y_train)

            train_roc_auc = calc_roc_auc(X_train, y_train, dtc_temp)
            test_roc_auc = calc_roc_auc(X_test, y_test, dtc_temp)
            res_list.append([value, train_roc_auc, test_roc_auc])

        # First row of res_list holds the column names (same layout list2df uses).
        df_results = pd.DataFrame(res_list[1:], columns=res_list[0])

        # Plot train vs test AUC; skipped when nothing was tried because an
        # empty frame has no numeric data to draw.
        if not df_results.empty:
            df_results.plot(x=param_name, y=auc_columns)

        # Colour-coded view of the scores.
        cm = sns.light_palette("green", as_cmap=True)
        df_style = df_results.style.background_gradient(cmap=cm, subset=auc_columns)
        return df_results, df_style
    from string import ascii_letters
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    def multiplot(df):
        """Draw the lower triangle of ``df.corr()`` as an annotated heat map.

        Intended for spotting multicollinearity between columns.

        Args:
            df: DataFrame whose pairwise column correlations are plotted.

        Returns:
            (fig, ax): the matplotlib figure and the axes holding the heat map.
        """
        # Compute the correlation matrix
        corr = df.corr()

        # Mask the upper triangle (diagonal included) so each pair shows once.
        # Builtin ``bool`` is used because the ``np.bool`` alias no longer
        # exists in current NumPy releases.
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(16, 16))

        # Diverging colormap centred on zero correlation
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw explicitly on the axes created above rather than on whatever
        # axes happen to be current.
        sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
                    square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
        return f, ax
    # Plots histogram and scatter (vs target) side by side
    def plot_hist_scat_sns(df, target='index'):
        """Plot a seaborn distplot and regplot for each column in the dataframe vs target.

        df (DataFrame): the columns reported by DataFrame.describe() are plotted.
        target: name of the column containing the target variable.

        One figure is created per column; the top row holds the two subplots.
        """
        # NOTE(review): this listing is damaged - several statements below are
        # incomplete or reference names that are never defined here. Each gap is
        # flagged in place; restore from the original module before use.
        import matplotlib.ticker as mtick
        import matplotlib.pyplot as plt
        import seaborn as sns
            # NOTE(review): everything from here on is indented one level deeper
            # than the imports, so the statement that opened this block is
            # missing (presumably a `with` style context) - TODO confirm.
            ###  DEFINE AESTHETIC CUSTOMIZATIONS  -------------------------------##
            # Axis Label fonts
            # NOTE(review): the three font dicts below are never closed.
            fontTitle = {'fontsize': 14,
                       'fontweight': 'bold',
            fontAxis = {'fontsize': 12,
                       'fontweight': 'medium',
            fontTicks = {'fontsize': 8,
            # Formatting dollar sign labels
            fmtPrice = '${x:,.0f}'
            tickPrice = mtick.StrMethodFormatter(fmtPrice)
            ###  PLOTTING ----------------------------- ------------------------ ##
            # Loop through dataframe to plot
            for column in df.describe():
    #             print(f'\nCurrent column: {column}')
                # Create figure with subplots for current column
                # NOTE(review): `figsize` is not defined anywhere in this function.
                fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
                ##  SUBPLOT 1 --------------------------------------------------##
                i,j = 0,0
                # Define graphing keyword dictionaries for distplot (Subplot 1)
                hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
                kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
                # Plot distplot on ax[i,j] using hist_kws and kde_kws
                sns.distplot(df[column], norm_hist=True, kde=True,
                             hist_kws = hist_kws, kde_kws = kde_kws,
                             label=column+' histogram', ax=ax[i,j])
                # Set x axis label
                # NOTE(review): no statement follows the comment above.
                # Get x-ticks, rotate labels, and return
                xticklab1 = ax[i,j].get_xticklabels(which = 'both')
                ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
                # Set y-label 
                # NOTE(review): no statement follows the comment above.
                # Set y-grid
                ax[i, j].set_axisbelow(True)
                ax[i, j].grid(axis='y',ls='--')
                ##  SUBPLOT 2-------------------------------------------------- ##
                i,j = 0,1
                # Define the kwd dictionaries for scatter and regression line (subplot 2)
                # NOTE(review): `line_kws` is used below but never defined.
                scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
                # Plot regplot on ax[i,j] using line_kws and scatter_kws
                # NOTE(review): this call is never closed and no `ax=` argument is visible.
                sns.regplot(df[column], df[target], 
                            line_kws = line_kws,
                            scatter_kws = scatter_kws,
                # Set x-axis label
                 # Get x ticks, rotate labels, and return
                # NOTE(review): `xticklab2` is never assigned.
                ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
                # Set  y-axis label
                # Get, set, and format y-axis Price labels
                yticklab = ax[i,j].get_yticklabels()
        #         ax[i,j].get_yaxis().set_major_formatter(tickPrice) 
                # Set y-grid
                ax[i, j].set_axisbelow(True)
                ax[i, j].grid(axis='y',ls='--')       
                ## ---------- Final layout adjustments ----------- ##
                # Deleted unused subplots 
                # Optimizing spatial layout
                # NOTE(review): neither of the two steps named above has any code
                # here, and the function has no return statement.
    #             plt.savefig(figtitle)
    # Tukey's method: flag rows lying outside 1.5 * IQR of the quartiles
    def detect_outliers(df, n=0, features=None):
        """Return index labels of rows that are Tukey outliers in more than ``n`` features.

        A value is an outlier for a column when it is below Q1 - 1.5 * IQR or
        above Q3 + 1.5 * IQR of that column.

        Args:
            df (DataFrame): DataFrame containing the feature columns.
            n: a row is returned only if it is an outlier in more than ``n``
                of the checked columns. Default 0 (an outlier in any column).
            features: column names to check. Default None checks every
                numeric column of ``df``.

        Returns:
            list of index labels, usable with ``df.loc`` / ``df.drop``.

        Example:
            outliers_to_drop = detect_outliers(data, 2, ["col1", "col2"])
            data.loc[outliers_to_drop]  # show the outlier rows
            data = data.drop(outliers_to_drop, axis=0).reset_index(drop=True)
        """
        from collections import Counter

        if features is None:
            features = df.select_dtypes(include='number').columns

        outlier_indices = []
        for col in features:
            # 1st and 3rd quartiles
            Q1 = np.percentile(df[col], 25)
            Q3 = np.percentile(df[col], 75)
            # Tukey fence: 1.5 times the interquartile range
            outlier_step = 1.5 * (Q3 - Q1)
            is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
            # Collect this column's outlier rows; a row appears once per
            # column in which it is an outlier.
            outlier_indices.extend(df[is_outlier].index)

        # Count after all columns are processed, then keep rows flagged in
        # more than n columns.
        outlier_counts = Counter(outlier_indices)
        return [idx for idx, count in outlier_counts.items() if count > n]
    # describe_outliers -- calls detect_outliers
    def describe_outliers(df):
        """Summarise how many rows detect_outliers flags in each column.

        Args:
            df (DataFrame): every column in ``df.columns`` is checked on its
                own, so each must hold values detect_outliers can take
                percentiles of.

        Returns:
            DataFrame indexed by column name with 'total_outliers' (count) and
            'percent_total' (count as a percentage of the rows in ``df``,
            rounded to 2 decimals), plus a final 'grand_total' row holding the
            sum of each of those two columns.
        """
        n_rows = len(df.index)
        new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
        for col in df.columns:
            # Check one column at a time; n=0 keeps every row flagged for it.
            outies = detect_outliers(df, 0, [col])
            n_out = len(outies)
            # An empty frame has no outliers; avoid dividing by zero rows.
            percent = round((n_out / n_rows) * 100, 2) if n_rows else 0.0
            new_df.loc[col] = [n_out, percent]
        new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
        return new_df
    #### Cohen's d
    def Cohen_d(group1, group2):
        '''Compute Cohen's d effect size between two groups.

        group1: Series or NumPy array
        group2: Series or NumPy array

        Returns a floating point number: the difference of the group means
        divided by the square root of the size-weighted pooled variance.
        The variance is whatever each group's own ``.var()`` returns.
        '''
        diff = group1.mean() - group2.mean()

        n1, n2 = len(group1), len(group2)
        var1 = group1.var()
        var2 = group2.var()

        # Pooled variance: each group's variance weighted by its size
        pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)

        # Cohen's d statistic
        d = diff / np.sqrt(pooled_var)
        return d
    def plot_pdfs(cohen_d=2):
        """Plot PDFs for two unit-variance normal distributions and print their overlap.

        cohen_d: number of standard deviations between the means (group1 is
            centred on 0, group2 on ``cohen_d``).

        Relies on ``evaluate_PDF`` and ``overlap_superiority``, which are
        defined outside this function. Returns nothing; it draws on the current
        pyplot figure and prints the overlap and superiority values.
        """
        # Imported locally so the function does not depend on module-level
        # names that may not be present.
        import scipy.stats
        from matplotlib import pyplot

        group1 = scipy.stats.norm(0, 1)
        group2 = scipy.stats.norm(cohen_d, 1)

        xs, ys = evaluate_PDF(group1)
        pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)

        xs, ys = evaluate_PDF(group2)
        pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)

        o, s = overlap_superiority(group1, group2)
        print('overlap', o)
        print('superiority', s)
    def list2df(list):
        """Turn a list of rows into a DataFrame, using the first row as column names.

        Args:
            list: sequence of rows where ``list[0]`` holds the column names and
                every following row holds one record. (The parameter name
                shadows the builtin; it is kept so keyword callers still work.)

        Returns:
            DataFrame built from ``list[1:]`` with columns ``list[0]``.
        """
        df_list = pd.DataFrame(list[1:], columns=list[0])
        return df_list
    def df_drop_regex(DF, regex_list):
        '''Use a list of regex to remove columns by name. Returns a new df.

        DF -- input dataframe to remove columns from (left unmodified).
        regex_list -- list of string patterns or regexp; every column whose
            name matches a pattern is removed.

        Returns df_cut -- copy of the input df without the dropped columns.
        Prints one "Removed <pattern>" line per pattern processed.
        '''
        df_cut = DF.copy()
        for r in regex_list:
            # DataFrame.filter(regex=...) selects the columns whose names match.
            matched = list(df_cut.filter(regex=r).columns)
            df_cut = df_cut.drop(columns=matched)
            print(f'Removed {r}\n')
        return df_cut
    ####### MIKE's PLOTTING
    # plotting order totals per month in violin plots
    def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
        '''Plots a violin plot with horizontal mean line, inner stick lines.

        x -- values for the categorical axis.
        y -- values for the numeric axis; must expose ``.mean()``.
        title -- title set on the plot.
        hue -- optional grouping passed to seaborn as ``hue``.
        ticklabels -- accepted for call compatibility; not used by this function.

        Returns (fig, ax) for the 12x10 figure that was created.
        '''
        fig, ax = plt.subplots(figsize=(12,10))

        # x and y are passed by keyword (newer seaborn releases no longer
        # accept them positionally) and the plot is bound to the axes above.
        sns.violinplot(x=x, y=y, hue=hue, cut=2, split=True, scale='count', scale_hue=True,
                       saturation=.5, alpha=.9, bw=.25, palette='Dark2', inner='stick',
                       ax=ax).set_title(title)

        # Dotted horizontal line at the overall mean of y
        ax.axhline(y.mean(), label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
        return fig, ax
    ### Example usage
    # #First, declare variables to be plotted
    # x = df_year_orders['month']
    # y = df_year_orders['order_total']
    # ticks = [v for v in month_dict.values()] 
    # title = 'Order totals per month with or without discounts'
    # hue = df_year_orders['Discount']>0
    ### Then call function
    # make_violinplot(x,y,title,hue, ticks), 
    def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
        '''Plot series1 and series2 on a wide histogram+KDE plot next to a narrow mean+SEM bar plot.

        series1, series2 -- the two data groups to compare.
        sname1, sname2 -- their names; ``.title()`` of each is used as the label.

        Returns (fig, ax) where ax is the list [ax0, ax1]: ax0 spans grid columns
        0-6 (distributions) and ax1 spans columns 7-9 (bar plot) of a 1x10 gridspec.

        NOTE(review): this listing is damaged - several statements below are
        incomplete. Each gap is flagged in place; restore from the original
        module before use.
        '''
        ## ADDING add_gridspec usage
        import pandas as pd
        import numpy as np
        from scipy.stats import sem
        import matplotlib.pyplot as plt
        import matplotlib as mpl
        import matplotlib.ticker as ticker
        import seaborn as sns
        from matplotlib import rcParams
        from matplotlib import rc
        # NOTE(review): the rcParams key is empty; presumably a font setting
        # given the 'serif' value - TODO confirm.
        rcParams[''] = 'serif'
        # Plot distributions of discounted vs full price groups'default')
        # with'tableau-colorblind10')):
            # NOTE(review): the two comment lines above look like remnants of
            # stripped statements, and the code below is indented one level
            # deeper with no visible block opener - TODO confirm.
            ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
           # Axis Label fonts
            # NOTE(review): none of the four font dicts below is closed.
            fontSuptitle ={'fontsize': 22,
                       'fontweight': 'bold',
            fontTitle = {'fontsize': 10,
                       'fontweight': 'medium',
            fontAxis = {'fontsize': 10,
                       'fontweight': 'medium',
            fontTicks = {'fontsize': 8,
            ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
            plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
            # Create fig object and declare figsize
            fig = plt.figure(constrained_layout=True, figsize=(8,3))
            # Define gridspec to create grid coordinates             
            gs = fig.add_gridspec(nrows=1,ncols=10)
            # Assign grid space to ax with add_subplot
            ax0 = fig.add_subplot(gs[0,0:7])
            ax1 = fig.add_subplot(gs[0,7:10])
            #Combine into 1 list
            ax = [ax0,ax1]
            ### ------------------  SUBPLOT 1  ------------------ ###
            ## --------- Defining series1 and 2 for subplot 1------- ##
            ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
            # Group 1: data, label, hist_kws and kde_kws
            # NOTE(review): the 'kde_kws' key is missing before the second
            # nested dict and the literal is never closed.
            plotS1 = {'data': series1, 'label': sname1.title(),
                       'hist_kws' :
                        {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
                        {'color':'gray', 'linestyle': '--', 'linewidth':2,
            # Group 2: data, label, hist_kws and kde_kws
            # NOTE(review): no 'kde_kws' entry is visible although it is read
            # below, and the literal is never closed.
            plotS2 = {'data': series2,
                        'label': sname2.title(), 
                        'hist_kws' :
                        {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
            # plot group 1
            # NOTE(review): both distplot calls below are never closed and no
            # `ax=` argument is visible.
            sns.distplot(plotS1['data'], label=plotS1['label'],
                         hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
            # plot group 2
            sns.distplot(plotS2['data'], label=plotS2['label'],
                         hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
            # NOTE(review): the x-label text argument is missing.
            ax[0].set_xlabel(, fontdict=fontAxis)
            ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
            ### ------------------  SUBPLOT 2  ------------------ ###
            # Import scipy for error bars
            from scipy.stats import sem
            # Declare x y group labels(x) and bar heights(y)
            x = [plotS1['label'], plotS2['label']]
            y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
            # Error bars are the standard error of the mean of each group
            yerr = [sem(plotS1['data']), sem(plotS2['data'])]
            err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
            # Create the bar plot
            ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
            # Customize subplot 2
            ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
            ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
            # `test` and `labels` are computed but not used afterwards; the tick
            # labels are set from the group labels directly.
            test = ax1.get_xticklabels()
            labels = [x.get_text() for x in test]
            ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
    #         xlab = [x.get_text() for x in xlablist]
    #         ax[1].set_xticklabels(xlab,rotation=45)
    #         fig.savefig('H1_EDA_using_gridspec.png')
    #         plt.tight_layout()
        #     print(f')
            return fig,ax

    Project details

    Download files

    Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

    Files for JMI-MVM, version 0.3.4
    Filename, size File type Python version Upload date Hashes
    Filename, size JMI_MVM-0.3.4-py3-none-any.whl (65.6 kB) File type Wheel Python version py3 Upload date Hashes View
    Filename, size JMI_MVM-0.3.4.tar.gz (41.5 kB) File type Source Python version None Upload date Hashes View

    Supported by

    AWS AWS Cloud computing Datadog Datadog Monitoring DigiCert DigiCert EV certificate Facebook / Instagram Facebook / Instagram PSF Sponsor Fastly Fastly CDN Google Google Object Storage and Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Salesforce Salesforce PSF Sponsor Sentry Sentry Error logging StatusPage StatusPage Status page