Added RMSProp Optimizer subroutine #144

Merged · 4 commits · Jun 20, 2023
83 changes: 81 additions & 2 deletions example/quadratic.f90
@@ -3,21 +3,23 @@ program quadratic_fit
! stochastic gradient descent, batch gradient descent, minibatch gradient
! descent, and RMSprop.
use nf, only: dense, input, network
use nf_dense_layer, only: dense_layer

implicit none
type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd
type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop

! Training parameters
integer, parameter :: num_epochs = 1000
integer, parameter :: train_size = 1000
integer, parameter :: test_size = 30
integer, parameter :: batch_size = 10
real, parameter :: learning_rate = 0.01
real, parameter :: decay_rate = 0.9

! Input and output data
real, allocatable :: x(:), y(:) ! training data
real, allocatable :: xtest(:), ytest(:) ! testing data
real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:)
real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:)

integer :: i, n

@@ -42,6 +44,7 @@ program quadratic_fit
net_sgd = network([input(1), dense(3), dense(1)])
net_batch_sgd = network([input(1), dense(3), dense(1)])
net_minibatch_sgd = network([input(1), dense(3), dense(1)])
net_rms_prop = network([input(1), dense(3), dense(1)])

! Print network info to stdout; this will be the same for all four networks.
call net_sgd % print_info()
@@ -55,15 +58,20 @@ program quadratic_fit
! Mini-batch SGD optimizer
call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)

! RMSProp optimizer
call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate)

! Calculate predictions on the test set
ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)]

! Print the mean squared error
print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)
print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest)

contains

@@ -161,6 +169,77 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si
end do
end subroutine minibatch_gd_optimizer

subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
! RMSprop optimizer that updates weights using a running average of squared gradients.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: learning_rate, decay_rate
integer, intent(in) :: num_epochs
integer :: i, j, n
real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero

! Define a dedicated type to store the RMSprop gradients.
! This is needed because array sizes vary between layers and we need to
! keep track of the running average of squared gradients for each layer
! over time.
! For now this works only for dense layers; a similar type will be needed
! for conv2d layers (a commented sketch follows the type definition below).
type :: rms_gradient_dense
real, allocatable :: dw(:,:)
real, allocatable :: db(:)
end type rms_gradient_dense
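
! A conv2d counterpart is not defined in this change. A rough, commented-out
! sketch of what it might look like (the rank of dw below is an assumption,
! not code from this PR):
!
! type :: rms_gradient_conv2d
!   real, allocatable :: dw(:,:,:,:) ! running average of squared kernel gradients
!   real, allocatable :: db(:)       ! running average of squared bias gradients
! end type rms_gradient_conv2d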

type(rms_gradient_dense), allocatable :: rms(:)

print *, "Running RMSprop optimizer..."

! Here we allocate the array of RMS gradient derived types.
! We need one for each dense layer; however, we allocate the array to the
! total number of layers because it makes indexing by layer easier.
allocate(rms(size(net % layers)))
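! Entries of rms that correspond to non-dense layers are simply left
! unallocated; the select type construct below only touches dense layers.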

do n = 1, num_epochs

do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
end do

! RMSprop update rule
do j = 1, size(net % layers)
select type (this_layer => net % layers(j) % p)
type is (dense_layer)

! If this is our first time here for this layer, allocate the
! internal RMS gradient arrays and initialize them to zero.
if (.not. allocated(rms(j) % dw)) then
allocate(rms(j) % dw, mold=this_layer % dw)
allocate(rms(j) % db, mold=this_layer % db)
rms(j) % dw = 0
rms(j) % db = 0
end if

! Update the RMS gradients using the RMSprop moving average rule
rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
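! In equation form, with g = accumulated gradient and beta = decay_rate:
!   v := beta * v + (1 - beta) * g**2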

! Update weights and biases using the RMSprop update rule
this_layer % weights = this_layer % weights - learning_rate &
/ sqrt(rms(j) % dw + epsilon) * this_layer % dw
this_layer % biases = this_layer % biases - learning_rate &
/ sqrt(rms(j) % db + epsilon) * this_layer % db
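! i.e. w := w - learning_rate * g / sqrt(v + epsilon)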

! We have updated the weights and biases, so we need to reset the
! gradients to zero for the next epoch.
this_layer % dw = 0
this_layer % db = 0

end select
end do

end do

end subroutine rmsprop_optimizer

subroutine shuffle(arr)
! Shuffle an array using the Fisher-Yates algorithm.
integer, intent(inout) :: arr(:)