StackML/feature_engineering.py at master · ggiesa/StackML · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd


class FeatureEngineering:
# class 'features' generates a formatted training and test set with the option to
# generates staggered target variable columns for training a model to
# predict <y_gen> time steps into the future.

    def __init__(self, data,
                 target_variable = 'price',
                 split_percent = .8,
                 y_gen = 0,
                 keep_all = True,
                 reset_index = False):

        '''
        - 'data' should be a pandas DataFrame of numerical data with columns for
            each feature, including the target variable.

        - target_variable should be a string with the name of the column that contains
            the target variable

        - split_percent should be a decimal with the percentage of data the model
            should train on. (ex. 90 percent train, 10 percent test --> split_percent = .9)

        - y_gen should be the number of time steps to generate y to. y_gen = 3
            will produce 4 columns; col[0] being y, col[1:n] being (y + 1 timestep)...(y + n timesteps)

        - keep_all toggles whether we want to keep columns between y and (y + n timesteps), or just
            keep y and (y + n timesteps). In other words, col[0] and col[n], or col[0:n]

        '''


        assert 0 < split_percent < 1, 'split percent must be a decible between 0 and 1'
        assert type(data) == pd.core.frame.DataFrame, 'data must be a pandas dataframe'

        # Check to be sure that 'data' only contains numerical data that can be trained on
        for column in data.columns:
            assert (type(data[column][0]) in [int, float, np.float64]), \
                    'Columns must only contain numerical data. Columns in question: {}'.format(column)


        self.target_variable = target_variable
        self.split_percent = split_percent
        self.y_gen = y_gen
        self.keep_all = keep_all
        self.reset_index = reset_index

        # Generate target data columns
        self.data = self.__generate_y(data)

        # Generate the rounded index for splitting the dataset
        self.split_ind = self.__split(self.split_percent)

        # Split data into X and Y
        self.X = self.data.iloc[:, :(self.data.shape[1]-(self.y_gen+1))]
        if self.y_gen > 0:
            self.Y = self.data.iloc[:, (self.data.shape[1]-(self.y_gen+1)):]
        else:
            self.Y = self.data[target_variable]

        # Split Y into train and test sets
        if self.Y.shape == (self.Y.shape[0],):
            self.ytrain = self.Y[:self.split_ind]
            self.ytest = self.Y[self.split_ind:]
        else:
            self.ytrain = self.Y.iloc[:self.split_ind, :]
            self.ytest = self.Y.iloc[self.split_ind:, :]

        # Split X into train and test sets
        self.Xtrain = self.X.iloc[:self.split_ind, :]
        self.Xtest = self.X.iloc[self.split_ind:, :]


    def __split(self, split_percent):
        length = self.data.shape[0]
        split_ind = round(length*split_percent)
        return split_ind


    def __generate_y(self, data):
    # Generate Y data for n number of time steps beyond 'current'

        if self.y_gen == 0:
            if self.reset_index:
                print('Despite index not being affected by y_gen, reseting index')

            return data.reset_index(drop=True)

        if self.reset_index:
            data = data.reset_index(drop = True)

        tempY = list(data[self.target_variable])
        size = len(tempY)
        Y = pd.DataFrame()
        for i in range(self.y_gen + 1):
            if i == 0:
                Y['current'] = np.ones(size)
            else:
                Y['%d_dt_future' %(i)] = np.ones(size)

        for i in range(self.y_gen + 1):
            Y.iloc[:size-i, i] = tempY[i:size+1]

        data = pd.concat([data,Y], axis = 1)
        data = data.drop([i for i in range(len(Y)-self.y_gen, len(Y))])
        data = data.drop(self.target_variable, axis = 1)

        return data


    def ret(self):

        if self.keep_all:
            return self.Xtrain, self.ytrain, self.Xtest, self.ytest
        else:
            if self.y_gen == 0:
                return self.Xtrain, self.ytrain, self.Xtest, self.ytest
            else:
                return self.Xtrain, self.ytrain.iloc[:, self.y_gen], \
                       self.Xtest,  self.ytest.iloc[:,  self.y_gen]