
Python Data Analysis Notes

  1. Notes from the STAT 8051 project

    • Dataset description
        # summary statistics for the categorical columns, plus dtypes and memory info
        train_data.describe(include=["object", "category"])
        train_data.info()
      
    • Check the number of NA values
        # number of rows containing at least one missing value
        train_data.shape[0] - train_data.dropna().shape[0]
      
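      A per-column breakdown is often more useful than the single total:

        # number of missing values in each column
        train_data.isna().sum()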
    • Set index
        train_data = train_data.set_index('claim_number')
      
    • Select rows by condition and replace values
        # annual_income uses -1 as a missing-value sentinel; replace it with NaN
        train_data.loc[train_data.annual_income == -1, 'annual_income'] = np.nan
      
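      An equivalent one-liner uses Series.replace:

        # same effect: treat the -1 sentinel as missing
        train_data['annual_income'] = train_data['annual_income'].replace(-1, np.nan)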
    • Transform into categorical variables
        train_data["marital_status"] = pd.Categorical(train_data["marital_status"])
      
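      The astype route does the same conversion and extends naturally to several columns (the column list below is illustrative; adjust it to the actual data):

        # convert a batch of columns to the pandas category dtype
        for col in ["marital_status", "state"]:
            train_data[col] = train_data[col].astype("category")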
    • Print the list of missing columns
        import itertools

        # keep only the column names where isna().any() is True
        print(list(itertools.compress(list(train_data), list(train_data.isna().any()))))
      
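      The same list can be obtained with pandas alone, which reads more directly:

        # columns that contain at least one missing value
        print(train_data.columns[train_data.isna().any()].tolist())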
    • Use mean and mode to do imputation
        # impute the numeric column with its mean
        # (plain assignment avoids pandas' chained-assignment warning with inplace=True)
        annual_income_mean = train_data.annual_income.mean()
        train_data['annual_income'] = train_data['annual_income'].fillna(annual_income_mean)

        # impute the categorical column with its mode
        marital_status_mode = train_data.marital_status.mode().values[0]
        train_data['marital_status'] = train_data['marital_status'].fillna(marital_status_mode)
      
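      For more columns, scikit-learn's SimpleImputer covers both cases and lets the statistics be fit on the training data and reused on the test data (a sketch, assuming scikit-learn is available and test_data has the same columns):

        from sklearn.impute import SimpleImputer

        # fit the mean on train, then apply the same value to test
        num_imputer = SimpleImputer(strategy='mean')
        train_data[['annual_income']] = num_imputer.fit_transform(train_data[['annual_income']])
        test_data[['annual_income']] = num_imputer.transform(test_data[['annual_income']])

        # strategy='most_frequent' is the analogue of mode imputation
        cat_imputer = SimpleImputer(strategy='most_frequent')
        train_data[['marital_status']] = cat_imputer.fit_transform(train_data[['marital_status']])
        test_data[['marital_status']] = cat_imputer.transform(test_data[['marital_status']])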
    • One-hot encoding
        # one-hot encoding for state
        state_dummies = pd.get_dummies(test_data['state'],
                                       prefix='state', drop_first=True)
        test_data = pd.concat([test_data, state_dummies], axis=1)
        test_data.drop(["state"], axis=1, inplace=True)

        # clean up variable names: all lowercase with underscore separators
        train_data.columns = [s.lower().replace(' ', '_') for s in train_data.columns]
      
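      One caveat: encoding train and test separately can leave mismatched dummy columns when a category appears in only one of them. Aligning the frames afterwards is one way to guard against that (a sketch):

        # keep exactly the training columns; dummies missing from test become 0
        # note: with join='left', train-only columns (e.g. the target) are also
        # added to test filled with 0, so drop the target before aligning
        train_data, test_data = train_data.align(test_data, join='left',
                                                 axis=1, fill_value=0)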
    • Add group-by means as new features
        ## marital_status: mean fraud rate per category
        grouped_marital_status_mean = (
            train.groupby('marital_status')['fraud']
                 .mean()
                 .rename('fraud_marital_status')
                 .reset_index()
        )
        train = pd.merge(train, grouped_marital_status_mean, on="marital_status", how="left")
        test = pd.merge(test, grouped_marital_status_mean, on="marital_status", how="left")
      
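      Categories that appear in test but not in train come out of the left merge as NaN; one reasonable fallback (an assumption about the desired handling) is the overall fraud rate:

        # fall back to the global mean for categories unseen in training
        test['fraud_marital_status'] = test['fraud_marital_status'].fillna(train['fraud'].mean())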
    • Separate the dataset into training data and validation data
        import random
        import numpy as np

        ## generate row indexes for a 70/30 split
        random.seed(300)
        rindex = np.array(random.sample(range(len(train)), round(0.7 * len(train))))

        train_df = train.iloc[rindex, :]
        validation_df = train.drop(train.index[rindex])
      
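      scikit-learn's train_test_split does the same 70/30 split in one call (the random_state value here is arbitrary):

        from sklearn.model_selection import train_test_split

        # random_state makes the split reproducible
        train_df, validation_df = train_test_split(train, train_size=0.7, random_state=300)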
    • Drop a list of variables at once
        train_df.drop(["claim_year", "claim_day", "zip_code",
                       "claim_date", "claim_number"], axis=1, inplace=True)
      
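      Since claim_number became the index earlier, dropping it by name raises a KeyError once it is no longer a column; errors='ignore' lets the same call skip absent names:

        # same drop, but absent column names are skipped instead of raising
        train_df.drop(["claim_year", "claim_day", "zip_code",
                       "claim_date", "claim_number"],
                      axis=1, inplace=True, errors='ignore')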
    • Show the head and tail of a dataframe
        data.head()   # first five rows by default
        data.tail()   # last five rows by default
      
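      For a random peek instead of the ends, DataFrame.sample works the same way:

        data.sample(5)   # five randomly chosen rows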
  2. Plots

    • Barplot with the values as the labels
        import numpy as np
        import matplotlib.pyplot as plt
        plt.rcdefaults()

        objects = ('Adaboost', 'Random Forest', 'Logistic Regression',
                   'XGBoost', 'LightGBM', 'XGBoost_LightGBM')
        y_pos = np.arange(len(objects))
        performance = (0.719963411, 0.711689776, 0.709100835, 0.725387261,
                       0.729196909, 0.730128054)

        plt.figure(figsize=(8, 6))
        plt.ylim(0.70, 0.74)
        rects = plt.bar(y_pos, performance, align='center', alpha=0.5)
        plt.xticks(y_pos, objects, rotation=30)
        plt.ylabel('AUC')
        plt.title('Model Performance')

        def autolabel(rects):
            # write each bar's value just above its top
            for rect, perf in zip(rects, performance):
                plt.text(rect.get_x() - rect.get_width()/6, 1.001*perf, '%s' % float(perf))
        autolabel(rects)

        plt.show()
      
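      On matplotlib 3.4 or newer, Axes.bar_label replaces the manual autolabel loop (a sketch reusing rects from above):

        # requires matplotlib >= 3.4; labels each bar in the container
        plt.gca().bar_label(rects, fmt='%.3f')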
    • Plot binned average
        grouped_safty_rating = train["fraud"].groupby(train['safty_rating'])
        grouped_safty_rating_mean = grouped_safty_rating.mean().to_frame()
        grouped_safty_rating_mean['safty_rating'] = grouped_safty_rating_mean.index
        plt.scatter(grouped_safty_rating_mean["safty_rating"], grouped_safty_rating_mean['fraud'])
        plt.xlabel("safty_rating")
        plt.ylabel("fraud rate")
      
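      For a continuous variable, the same picture can be built over intervals with pd.cut instead of the raw values (a sketch; the bin count of 10 is arbitrary):

        # average fraud rate within 10 equal-width bins of safty_rating
        bins = pd.cut(train['safty_rating'], bins=10)
        binned_mean = train.groupby(bins, observed=True)['fraud'].mean()
        # plot at each interval's midpoint
        plt.scatter([iv.mid for iv in binned_mean.index], binned_mean.values)
        plt.xlabel("safty_rating (binned)")
        plt.ylabel("fraud rate")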
This post is licensed under CC BY 4.0 by the author.