1 year ago
#357277
WONJUN LEE
How to use sklearn.feature_selection with 3D numpy array data
I have 3D data of shape (150, 1040, 82) = (frame, feature, num_data), i.e. a 3D tensor. This array is X_data, and Y_data has shape (82,). The full code is below.
The labeling_load() function reads each CSV file as a 2D array with 150 frame rows and 1040 feature columns, and stacks these 2D arrays into a final array of shape (150, 1040, n). The label vector Y_data gets one entry per stacked array, so its length matches n.
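Before the full code, here is a minimal sketch with random data that I believe reproduces the same situation (only the shapes matter here; in the real code the values come from the CSV files):

import numpy as np
from sklearn.model_selection import train_test_split

# Same shapes as my real data: 150 frames, 1040 features, 82 samples,
# and one 0/1 label per sample.
X_data = np.random.rand(150, 1040, 82)
Y_data = np.random.randint(0, 2, size=82)

# This call fails with the same ValueError that is shown at the end of this post.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y_data, test_size=0.2, random_state=10)

The full code: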
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def labeling_load():
    # Read the label sheet and build X_data / Y_data from the per-section CSV files.
    lableing_name = pd.read_excel("data.xlsx")
    pd_labeling_name = pd.DataFrame(lableing_name)
    Y_data = []
    k = 0
    for ID in os.listdir("test_han/"):
        ID_lable = pd_labeling_name["ID"] == str(ID)
        Human_lable = pd_labeling_name["정김령"] == 0
        Final_lable = pd_labeling_name[ID_lable & Human_lable]
        list_final_lable = Final_lable.values.tolist()
        path = "test_han/" + str(ID) + "/"
        file = os.listdir(path)
        list_num_section_m1 = []
        list_num_section_0 = []
        list_num_section_1 = []
        list_num_section_2 = []
        list_num_section_3 = []
        list_num_section_4 = []
        for i in range(len(file)):
            game_state = str(file[i].split("_")[0])
            num_section = str(file[i].split("_")[1].split(".")[0])
            if game_state == "-1":
                list_num_section_m1.append(int(num_section))
            elif game_state == "0":
                list_num_section_0.append(int(num_section))
            elif game_state == "1":
                list_num_section_1.append(int(num_section))
            elif game_state == "2":
                list_num_section_2.append(int(num_section))
            elif game_state == "3":
                list_num_section_3.append(int(num_section))
            elif game_state == "4":
                list_num_section_4.append(int(num_section))
        for j in range(len(list_num_section_m1)):
            if k == 0:
                # The first CSV initialises X_data; all later CSVs are stacked onto it.
                data_m1 = pd.read_csv(path + "-1_" + str(j) + ".csv")
                data_m1 = data_m1.replace('Nan', -100)
                data_m1 = data_m1.iloc[0:, 2:1042]
                X_data = data_m1.to_numpy()
                label_m1 = list_final_lable[0][j + 8]
                Y_data.append(label_m1)
                k = 2
            else:
                data_m1 = pd.read_csv(path + "-1_" + str(j) + ".csv")
                data_m1 = data_m1.replace('Nan', -100)
                data_m1 = data_m1.iloc[0:, 2:1042]
                np_data_m1 = data_m1.to_numpy()
                label_m1 = list_final_lable[0][j + 8]
                if label_m1 == 0 or label_m1 == 1:
                    if np_data_m1.shape[0] == 150:
                        X_data = np.dstack((X_data, np_data_m1))
                        Y_data.append(label_m1)
        for j in range(len(list_num_section_0)):
            data_0 = pd.read_csv(path + "0_" + str(j) + ".csv")
            data_0 = data_0.replace('Nan', -100)
            data_0 = data_0.iloc[0:, 2:1042]
            np_data_0 = data_0.to_numpy()
            label_0 = list_final_lable[0][j + 54]
            if label_0 == 0 or label_0 == 1:
                if np_data_0.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_0))
                    Y_data.append(label_0)
        for j in range(len(list_num_section_1)):
            data_1 = pd.read_csv(path + "1_" + str(j) + ".csv")
            data_1 = data_1.replace('Nan', -100)
            data_1 = data_1.iloc[0:, 2:1042]
            np_data_1 = data_1.to_numpy()
            label_1 = list_final_lable[0][j + 78]
            if label_1 == 0 or label_1 == 1:
                if np_data_1.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_1))
                    Y_data.append(label_1)
        for j in range(len(list_num_section_2)):
            data_2 = pd.read_csv(path + "2_" + str(j) + ".csv")
            data_2 = data_2.replace('Nan', -100)
            data_2 = data_2.iloc[0:, 2:1042]
            np_data_2 = data_2.to_numpy()
            label_2 = list_final_lable[0][j + 102]
            if label_2 == 0 or label_2 == 1:
                if np_data_2.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_2))
                    Y_data.append(label_2)
        for j in range(len(list_num_section_3)):
            data_3 = pd.read_csv(path + "3_" + str(j) + ".csv")
            data_3 = data_3.replace('Nan', -100)
            data_3 = data_3.iloc[0:, 2:1042]
            np_data_3 = data_3.to_numpy()
            label_3 = list_final_lable[0][j + 126]
            if label_3 == 0 or label_3 == 1:
                if np_data_3.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_3))
                    Y_data.append(label_3)
        for j in range(len(list_num_section_4)):
            data_4 = pd.read_csv(path + "4_" + str(j) + ".csv")
            data_4 = data_4.replace('Nan', -100)
            data_4 = data_4.iloc[0:, 2:1042]
            np_data_4 = data_4.to_numpy()
            label_4 = list_final_lable[0][j + 150]
            if label_4 == 0 or label_4 == 1:
                if np_data_4.shape[0] == 150:
                    X_data = np.dstack((X_data, np_data_4))
                    Y_data.append(label_4)
    print(len(X_data), len(Y_data))
    X_data = np.array(X_data)
    Y_data = np.array(Y_data)
    return X_data, Y_data
def randomforest2():
    # Train a random forest on the global X / y and save the feature importances.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=10)
    rf.fit(X_train, y_train)
    importances = rf.feature_importances_
    f = open("importance.csv", "w")
    wr = csv.writer(f)
    wr.writerow(importances)
    f.close()
    print(len(importances))
    print(importances)
    indices = np.argsort(importances)[::-1]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X_train.shape[1]), importances[indices],
            color="lightsalmon", align="center")
    plt.xlim([-1, X_train.shape[1]])
    plt.show()
if __name__ == "__main__":
    X_data, Y_data = labeling_load()
    X = X_data
    y = Y_data
    randomforest2()
The error message is:
Traceback (most recent call last):
  File "C:\Users\Wonjun\PycharmProjects\ADHD\labeling_load.py", line 228, in <module>
    randomforest2()
  File "C:\Users\Wonjun\PycharmProjects\ADHD\labeling_load.py", line 195, in randomforest2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
  File "C:\Users\Wonjun\anaconda3\lib\site-packages\sklearn\model_selection\_split.py", line 2172, in train_test_split
    arrays = indexable(*arrays)
  File "C:\Users\Wonjun\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 356, in indexable
    check_consistent_length(*result)
  File "C:\Users\Wonjun\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 319, in check_consistent_length
    raise ValueError("Found input variables with inconsistent numbers of"
ValueError: Found input variables with inconsistent numbers of samples: [150, 82]
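I suspect the problem is that train_test_split (and sklearn.feature_selection in general) treats the first axis as the sample axis, while in my X_data the 82 samples are on the last axis and every sample is itself a 2D (frame, feature) block. What I have been considering is a reshape like the one below before splitting and feature selection, but I am not sure it is the right way to keep the frame/feature information (the name X_2d is just something I made up for this sketch):

import numpy as np

# Assuming X_data has shape (150, 1040, 82) and Y_data has shape (82,):
# move the sample axis to the front, then flatten each sample's
# (150, 1040) frame-by-feature block into one row of 150 * 1040 values.
X_2d = np.transpose(X_data, (2, 0, 1)).reshape(X_data.shape[2], -1)  # -> (82, 156000)

Is this the correct way to feed 3D data to sklearn.feature_selection / RandomForestClassifier, or is there a better way?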
scikit-learn
feature-selection