1 year ago
#357277
WONJUN LEE
How to use sklearn.feature_selection with 3D numpy array data
I have 3D data of shape (150, 1040, 82) = (frame, feature, num_data), i.e. a 3D tensor. This array is X_data, and Y_data has shape (82,). The full code is below.
The labeling_load() function reads each CSV file as a 2D array with 150 frame rows and 1040 feature columns, and stacks these 2D arrays into a final array of shape (150, 1040, n). The label vector Y_data gets one entry per stacked array, so its length matches n.
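Before the full code, here is a minimal sketch with random data that I believe reproduces the same situation (only the shapes matter here; in the real code the values come from the CSV files):

import numpy as np
from sklearn.model_selection import train_test_split

# Same shapes as my real data: 150 frames, 1040 features, 82 samples,
# and one 0/1 label per sample.
X_data = np.random.rand(150, 1040, 82)
Y_data = np.random.randint(0, 2, size=82)

# This call fails with the same ValueError that is shown at the end of this post.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y_data, test_size=0.2, random_state=10)

The full code: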
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def labeling_load():
    # Read the label sheet and build X_data / Y_data from the per-section CSV files.
    lableing_name = pd.read_excel("data.xlsx")
    pd_labeling_name = pd.DataFrame(lableing_name)
    Y_data = []
    k = 0
    for ID in os.listdir("test_han/"):
        ID_lable = pd_labeling_name["ID"] == str(ID)
        Human_lable = pd_labeling_name["정김령"] == 0
        Final_lable = pd_labeling_name[ID_lable & Human_lable]
        list_final_lable = Final_lable.values.tolist()
        path = "test_han/" + str(ID) + "/"
        file = os.listdir(path)
        list_num_section_m1 = []
        list_num_section_0 = []
        list_num_section_1 = []
        list_num_section_2 = []
        list_num_section_3 = []
        list_num_section_4 = []
        for i in range(len(file)):
            game_state = str(file[i].split("_")[0])
            num_section = str(file[i].split("_")[1].split(".")[0])
            if game_state == "-1":
                list_num_section_m1.append(int(num_section))
            elif game_state == "0":
                list_num_section_0.append(int(num_section))
            elif game_state == "1":
                list_num_section_1.append(int(num_section))
            elif game_state == "2":
                list_num_section_2.append(int(num_section))
            elif game_state == "3":
                list_num_section_3.append(int(num_section))
            elif game_state == "4":
                list_num_section_4.append(int(num_section))
        for j in range(len(list_num_section_m1)):
            if k == 0:
                # The first CSV initialises X_data; all later CSVs are stacked onto it.
                data_m1 = pd.read_csv(path + "-1_" + str(j) + ".csv")
                data_m1 = data_m1.replace('Nan', -100)
                data_m1 = data_m1.iloc[0:, 2:1042]
                X_data = data_m1.to_numpy()
                label_m1 = list_final_lable[0][j + 8]
                Y_data.append(label_m1)
                k = 2
            else:
                data_m1 = pd.read_csv(path + "-1_" + str(j) + ".csv")
                data_m1 = data_m1.replace('Nan', -100)
                data_m1 = data_m1.iloc[0:, 2:1042]
                np_data_m1 = data_m1.to_numpy()
                label_m1 = list_final_lable[0][j + 8]
                if label_m1 == 0 or label_m1 == 1:
                    if np_data_m1.shape[0] == 150:
                        X_data = np.dstack((X_data, np_data_m1))
                        Y_data.append(label_m1)
        for j in range(len(list_num_section_0)):
            data_0 = pd.read_csv(path + "0_" + str(j) + ".csv")
            data_0 = data_0.replace('Nan', -100)
            data_0 = data_0.iloc[0:, 2:1042]
            np_data_0 = data_0.to_numpy()
            label_0 = list_final_lable[0][j + 54]
            if label_0 == 0 or label_0 == 1:
                if np_data_0.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_0))
                    Y_data.append(label_0)
        for j in range(len(list_num_section_1)):
            data_1 = pd.read_csv(path + "1_" + str(j) + ".csv")
            data_1 = data_1.replace('Nan', -100)
            data_1 = data_1.iloc[0:, 2:1042]
            np_data_1 = data_1.to_numpy()
            label_1 = list_final_lable[0][j + 78]
            if label_1 == 0 or label_1 == 1:
                if np_data_1.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_1))
                    Y_data.append(label_1)
        for j in range(len(list_num_section_2)):
            data_2 = pd.read_csv(path + "2_" + str(j) + ".csv")
            data_2 = data_2.replace('Nan', -100)
            data_2 = data_2.iloc[0:, 2:1042]
            np_data_2 = data_2.to_numpy()
            label_2 = list_final_lable[0][j + 102]
            if label_2 == 0 or label_2 == 1:
                if np_data_2.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_2))
                    Y_data.append(label_2)
        for j in range(len(list_num_section_3)):
            data_3 = pd.read_csv(path + "3_" + str(j) + ".csv")
            data_3 = data_3.replace('Nan', -100)
            data_3 = data_3.iloc[0:, 2:1042]
            np_data_3 = data_3.to_numpy()
            label_3 = list_final_lable[0][j + 126]
            if label_3 == 0 or label_3 == 1:
                if np_data_3.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_3))
                    Y_data.append(label_3)
        for j in range(len(list_num_section_4)):
            data_4 = pd.read_csv(path + "4_" + str(j) + ".csv")
            data_4 = data_4.replace('Nan', -100)
            data_4 = data_4.iloc[0:, 2:1042]
            np_data_4 = data_4.to_numpy()
            label_4 = list_final_lable[0][j + 150]
            if label_4 == 0 or label_4 == 1:
                if np_data_4.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_4))
                    Y_data.append(label_4)
    print(len(X_data), len(Y_data))
    X_data = np.array(X_data)
    Y_data = np.array(Y_data)
    return X_data, Y_data
def randomforest2():
    # Train a random forest on the global X / y and save the feature importances.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=10)
    rf.fit(X_train, y_train)
    importances = rf.feature_importances_
    f = open("importance.csv", "w")
    wr = csv.writer(f)
    wr.writerow(importances)
    f.close()
    print(len(importances))
    print(importances)
    indices = np.argsort(importances)[::-1]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X_train.shape[1]), importances[indices],
            color="lightsalmon", align="center")
    plt.xlim([-1, X_train.shape[1]])
    plt.show()
if __name__ == "__main__":
    X_data, Y_data = labeling_load()
    X = X_data
    y = Y_data
    randomforest2()
The error message is:
Traceback (most recent call last):
  File "C:\Users\Wonjun\PycharmProjects\ADHD\labeling_load.py", line 228, in <module>
    randomforest2()
  File "C:\Users\Wonjun\PycharmProjects\ADHD\labeling_load.py", line 195, in randomforest2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
  File "C:\Users\Wonjun\anaconda3\lib\site-packages\sklearn\model_selection\_split.py", line 2172, in train_test_split
    arrays = indexable(*arrays)
  File "C:\Users\Wonjun\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 356, in indexable
    check_consistent_length(*result)
  File "C:\Users\Wonjun\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 319, in check_consistent_length
    raise ValueError("Found input variables with inconsistent numbers of"
ValueError: Found input variables with inconsistent numbers of samples: [150, 82]
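I suspect the problem is that train_test_split (and sklearn.feature_selection in general) treats the first axis as the sample axis, while in my X_data the 82 samples are on the last axis and every sample is itself a 2D (frame, feature) block. What I have been considering is a reshape like the one below before splitting and feature selection, but I am not sure it is the right way to keep the frame/feature information (the name X_2d is just something I made up for this sketch):

import numpy as np

# Assuming X_data has shape (150, 1040, 82) and Y_data has shape (82,):
# move the sample axis to the front, then flatten each sample's
# (150, 1040) frame-by-feature block into one row of 150 * 1040 values.
X_2d = np.transpose(X_data, (2, 0, 1)).reshape(X_data.shape[2], -1)  # -> (82, 156000)

Is this the correct way to feed 3D data to sklearn.feature_selection / RandomForestClassifier, or is there a better way?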
scikit-learn
feature-selection