1 year ago
Abdelaziz BEN-EL-GADA
Training Loss decreasing but Validation Loss is stable
I am trying to train a neural network taken from this paper: https://scholarworks.rit.edu/cgi/viewcontent.cgi?referer=&httpsredir=1&article=10455&context=theses. See this image: Neural Network Architecture
I am using pytorch-lightning to use multi-GPU training.
I am feeding this network 3-channel optical flows (UVC: U is horizontal temporal displacement, V is vertical temporal displacement, C represents the confidence map).
Outputs represent the frame-to-frame pose and take the form of a vector of 6 floating-point values (translationX, translationY, translationZ, Yaw, Pitch, Roll). Translations vary from -0.25 to 3 meters and rotations vary from -6 to 6 degrees.
The output dataset is taken from the KITTI odometry dataset; there are 11 video sequences. I used the first 8 for training and a portion of the remaining 3 sequences for evaluation during training.
I trained the model for 200 epochs (this took 33 hours on 8 GPUs).
During this training, training loss decreases but validation loss remains constant during the whole training process.
# Input normalisation: per-channel mean / std for the 3-channel (U, V, C) flow images.
# NOTE(review): the pasted code had an unbalanced "]" here, so a leading
# transform (e.g. a tensor conversion) may have been lost -- confirm against
# the real script.
transform = transforms.Compose(
    [
        transforms.Normalize((0.3973, 0.2952, 0.4500), (0.4181, 0.4362, 0.3526)),
    ]
)

# Hyper-parameters.
batch_size = 8
val_data_percentage = 0.06  # fraction of the held-out set used for validation
epochs = 200
learning_rate = 0.0001

train_dataset = FlowsAndPoses("./uvc_flows_png/train/", "./relative_poses/train/", transform)
test_dataset = FlowsAndPoses("./uvc_flows_png/test/", "./relative_poses/test/", transform)

# Carve the validation subset out of the held-out set.
dataset_length = len(test_dataset)
test_split_length = int(dataset_length * (1 - val_data_percentage))
val_split_length = dataset_length - test_split_length
# Under DDP every process re-executes this script. Without a fixed generator
# each rank would draw a *different* random split, so the ranks would not
# agree on what the validation set is. Seeding makes the split identical
# everywhere and reproducible across runs.
test_dataset, val_dataset = random_split(
    test_dataset,
    [test_split_length, val_split_length],
    generator=torch.Generator().manual_seed(0),
)
print("Train: ", len(train_dataset), " Validation: ", len(val_dataset))

# Mean absolute error over the 6 pose components.
criterion = nn.L1Loss()
class Net(pl.LightningModule):
    """Convolutional regressor mapping a 3-channel flow image to a 6-value pose.

    Nine convolutions with LeakyReLU activations are followed by two fully
    connected layers with dropout in between. The loss is the module-level
    ``criterion`` and the optimiser uses the module-level ``learning_rate``.
    """

    def __init__(self):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call the first ``self.conv1 = ...`` raises.
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 7, 2)
        self.conv2 = nn.Conv2d(64, 128, 5, 2)
        self.conv3 = nn.Conv2d(128, 256, 5, 2)
        self.conv4 = nn.Conv2d(256, 256, 3, 1)
        self.conv5 = nn.Conv2d(256, 512, 3, 2)
        self.conv6 = nn.Conv2d(512, 512, 3, 1)
        self.conv7 = nn.Conv2d(512, 512, 3, 2)
        self.conv8 = nn.Conv2d(512, 512, 3, 1)
        self.conv9 = nn.Conv2d(512, 1024, 3, 2)
        # 32768 = 1024 channels * 32 spatial positions, so the input
        # resolution is effectively fixed by this layer.
        self.fc1 = nn.Linear(32768, 1024)
        self.drop = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, 6)
        self.net_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return the raw 6-value prediction for a batch of flow images."""
        conv_stack = (
            self.conv1, self.conv2, self.conv3,
            self.conv4, self.conv5, self.conv6,
            self.conv7, self.conv8, self.conv9,
        )
        for conv in conv_stack:
            x = self.net_relu(conv(x))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = self.net_relu(self.fc1(x))
        x = self.drop(x)
        return self.fc2(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        print("Training: ")
        inputs, labels = batch
        outputs = self.forward(inputs.float())
        loss = criterion(outputs, labels.float())
        self.log("my_loss", loss, on_epoch=True)
        return loss

    def training_epoch_end(self, training_step_outputs):
        """Create the per-epoch loss file and save the weights (rank 0 only)."""
        # With DDP every process runs this hook; letting all of them write the
        # same checkpoint path at once risks a corrupted file.
        if not self.trainer.is_global_zero:
            return
        loss_path = "losses/training_loss" + str(self.current_epoch) + "_" + str(self.global_step)
        # NOTE(review): nothing is written to this file in the visible code;
        # the context manager only guarantees the handle is closed.
        with open(loss_path, "w"):
            pass
        checkpoint_path = "checkpoints/trained_model_epoch" + str(self.current_epoch) + ".pth"
        try:
            torch.save(self.state_dict(), checkpoint_path)
        except (OSError, RuntimeError):
            # Best effort: a failed save should not abort a long training run.
            print("error saving")

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        inputs, labels = batch
        outputs = self.forward(inputs.float())
        loss = criterion(outputs, labels.float())
        # sync_dist averages the metric across all DDP processes; without it
        # the logged curve reflects only rank 0's shard of the validation set.
        self.log("val_loss", loss, sync_dist=True)
        return loss

    def validation_epoch_end(self, validation_step_outputs):
        """Create the per-epoch validation loss file (rank 0 only)."""
        if not self.trainer.is_global_zero:
            return
        loss_path = "losses/validation_loss" + str(self.current_epoch) + "_" + str(self.global_step)
        # NOTE(review): as with the training file, no content is written here.
        with open(loss_path, "w"):
            pass

    def configure_optimizers(self):
        """Return an Adam optimiser over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=learning_rate)
# Despite the variable name, this is the pose-regression network defined above.
autoencoder = Net()

# ``devices`` is the replacement for the deprecated ``gpus`` argument when
# ``accelerator`` is given explicitly.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0, 1, 2, 3, 4, 5, 6, 7],
    strategy="ddp",
    enable_checkpointing=True,
    max_epochs=epochs,
    check_val_every_n_epoch=1,
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation data must not be shuffled: a fixed order keeps the metric
# comparable between epochs and avoids Lightning's shuffle warning.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

trainer.fit(autoencoder, train_loader, val_loader)
Zero Grad and optimizer.step are handled by the pytorch-lightning library.
The results I got are in the following images:
Validation loss during training
If anyone has suggestions on how to address this problem, I would really appreciate it.
