Merge pull request #8 from JuliaGNI/dev-manifold-optimizers

Implemented Householder reflections with modifications.
JuliaGNI · May 26, 2023 · 42c7fda · 42c7fda
2 parents 604354f + 47cd4e6
commit 42c7fda
Show file tree

Hide file tree

Showing 66 changed files with 2,035 additions and 629 deletions.
diff --git a/Project.toml b/Project.toml
@@ -10,7 +10,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
-Manifolds = "1cead3c2-87b3-11e9-0ccd-23c62b72b94e"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
@@ -25,7 +24,6 @@ Distances = "0.10"
 ForwardDiff = "0.10"
 HDF5 = "0.16"
 Lux = "0.4"
-Manifolds = "0.8"
 NLsolve = "4"
 NNlib = "0.8"
 Optimisers = "0.2"

diff --git a/README.md b/README.md
@@ -7,4 +7,8 @@
 [![CI](https://github.com/JuliaGNI/GeometricMachineLearning.jl/workflows/CI/badge.svg)](https://github.com/JuliaGNI/GeometricMachineLearning.jl/actions?query=workflow:CI)
 [![Codecov Status](https://codecov.io/gh/JuliaGNI/GeometricMachineLearning.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaGNI/GeometricMachineLearning.jl)
 
-This package implements various scientific machine learning models that aim at learning dynamical systems with geometric structure, such as Hamiltonian (symplectic) or Lagrangian (variational) systems.
+`GeometricMachineLearning.jl` offers a flexible tool for designing neural networks for dynamical systems with geometric structure, such as Hamiltonian (symplectic) or Lagrangian (variational) systems.
+
+At its core every neural network comprises three components: a neural network architecture, a loss function and an optimizer. 
+
+Traditionally, physical properties have been encoded into the loss function (PiNN approach), but in `GeometricMachineLearning.jl` this is exclusively done through the architectures and the optimizers of the neural network, thus giving theoretical guarentees that these properties are actually preserved.
diff --git a/scripts/autoencoder.jl b/scripts/autoencoder.jl
@@ -0,0 +1,76 @@
+using MLDatasets 
+using Random
+using GeometricMachineLearning
+using LinearAlgebra
+
+using Lux
+#Lux is needed for this flatten operation -> should be removed!
+import Flux, Zygote
+
+train_x, train_y = MNIST(split=:train)[:]
+#for visualization
+#using ImageInTerminal, ImageShow
+#convert2image(MNIST, train_x[:,:,1])
+
+train_x = Flux.flatten(train_x) |> gpu
+train_y = Flux.onehotbatch(train_y, 0:9) |> gpu
+
+#encoder layer
+Ψᵉ = Chain(
+    Dense(28*28, 16, tanh),
+    Dense(16, 16, tanh),
+    Dense(16,10, Lux.σ)
+    )
+
+#decoder layer
+Ψᵈ = Chain(
+    Dense(10, 16, tanh),
+    Dense(16, 16, tanh),
+    Dense(16, 28*28, Lux.σ)
+)
+
+const model = Chain(Ψᵉ, Ψᵈ)
+
+ps, st = Lux.setup(Random.default_rng(), model) .|> gpu
+
+#loss_sing
+function loss_sing(ps, x, y)
+    norm(Lux.apply(model, x, ps, st)[1] - x)
+end
+
+function loss_sing(ps, train_x, train_y, index)
+    loss_sing(ps, train_x[:, index], train_y[:, index])    
+end
+function full_loss(ps, train_x, train_y)
+    num = size(train_x, 2)
+    mapreduce(index -> loss_sing(ps, train_x, train_y, index), +, 1:num)
+end
+
+o = AdamOptimizer()
+cache = init_optimizer_cache(model, o)
+println("initial loss: ", full_loss(ps, train_x, train_y))
+
+training_steps = 100
+
+num = size(train_x, 2)
+batch_size = 10
+
+for i in 1:training_steps
+    #@time dp = Zygote.gradient(loss_closure, ps)[1]
+
+    index₁ = Int(ceil(rand()*num))
+    x = train_x[:, index₁]
+    y = train_y[:, index₁] 
+    l, pb = Zygote.pullback(ps -> loss_sing(ps, x, y), ps)
+    dp = pb(one(l))[1]
+
+    indices = Int.(ceil.(rand(batch_size -1)*num))
+    for index in indices
+        x = train_x[:, index] 
+        y = train_y[:, index] 
+        l, pb = Zygote.pullback(ps -> loss_sing(ps, x, y), ps)
+        dp = _add(dp, pb(one(l))[1])
+    end
+    optimization_step!(o, Ψᵉ, ps, cache, dp)
+end
+println("final loss: ", full_loss(ps, train_x, train_y))
diff --git a/scripts/classifier.jl b/scripts/classifier.jl
@@ -0,0 +1,72 @@
+using MLDatasets 
+using Random
+using GeometricMachineLearning
+using LinearAlgebra
+using ProgressMeter
+using Lux
+
+#Lux is needed for this flatten operation -> should be removed!
+import Flux, Zygote
+
+train_x, train_y = MNIST(split=:train)[:]
+test_x, test_y = MNIST(split=:test)[:]
+
+#for visualization
+#using ImageInTerminal, ImageShow
+#convert2image(MNIST, train_x[:,:,1])
+
+train_x = Flux.flatten(train_x) #|> gpu
+train_y = Flux.onehotbatch(train_y, 0:9) #|> gpu
+test_x = Flux.flatten(test_x) #|> gpu
+test_y = Flux.onehotbatch(test_y, 0:9) #|> gpu
+
+#encoder layer
+Ψᵉ = Chain(
+    Dense(28*28, 64, tanh),
+    Dense(64, 64, tanh),
+    Dense(64, 10, Lux.σ)
+    )
+
+ps, st = Lux.setup(Random.default_rng(), Ψᵉ)  #.|> gpu
+
+#loss_sing
+function loss_sing(ps, x, y)
+    norm(Lux.apply(Ψᵉ, x, ps, st)[1] - y)
+end
+function loss_sing(ps, train_x, train_y, index)
+    loss_sing(ps, train_x[:, index], train_y[:, index])    
+end
+function full_loss(ps, train_x, train_y)
+    num = size(train_x, 2)
+    mapreduce(index -> loss_sing(ps, train_x, train_y, index), +, 1:num)
+end
+
+num = size(train_x,2)
+batch_size = 64
+training_steps = 100
+
+
+o = AdamOptimizer()
+cache = init_optimizer_cache(Ψᵉ, o)
+
+println("initial loss: ", full_loss(ps, train_x, train_y)/num)
+
+@showprogress "Training network ..." for i in 1:training_steps
+    index₁ = Int(ceil(rand()*num))
+    x = train_x[:, index₁] #|> gpu
+    y = train_y[:, index₁] #|> gpu
+    l, pb = Zygote.pullback(ps -> loss_sing(ps, x, y), ps)
+    dp = pb(one(l))[1]
+
+    indices = Int.(ceil.(rand(batch_size -1)*num))
+    for index in indices
+        x = train_x[:, index] #|> gpu
+        y = train_y[:, index] #|> gpu
+        l, pb = Zygote.pullback(ps -> loss_sing(ps, x, y), ps)
+        dp = _add(dp, pb(one(l))[1])
+    end
+    optimization_step!(o, Ψᵉ, ps, cache, dp)
+end
+println("final loss: ", full_loss(ps, train_x, train_y)/num)
+
+println("\nfinal test loss: ", full_loss(ps, test_x, test_y)/size(test_x, 2))
diff --git a/scripts/normal_forms/non_rev_ham.jl b/scripts/normal_forms/non_rev_ham.jl
@@ -0,0 +1,41 @@
+using GeometricMachineLearning, Lux, LinearAlgebra
+import Random, Zygote
+
+τᵋ = Chain(Gradient(2, 10, tanh),
+            Gradient(2, 10, tanh; change_q=false))
+
+Hᵋ = Chain(Dense(2, 10, tanh), Dense(10, 1, use_bias=false))
+
+ps_τ, st_τ = Lux.setup(Random.default_rng(), τᵋ)
+ps_F, st_F = Lux.setup(Random.default_rng(), Hᵋ) 
+
+Fᵋ(z, ps_F) = SymplecticMatrix(1)*(Zygote.gradient(z -> sum(Hᵋ(z, ps_F, st_F)[1]), z)[1])
+
+f(z) = [3*z[1]*z[2]^2, -z[2]^3]
+A = [0. -1.; 1. 0.]
+expA(θ) = [cos(θ) -sin(θ); sin(θ) cos(θ)]
+
+fτᵋ(z, ps_τ) = τᵋ(z, ps_τ, st_τ)[1] |> f 
+#const 
+θpoints = range(0, 2*π, 100)[1:end-1]
+function Πₐfτᵋ(z, ps_τ)
+    output = zero(z)
+    for θ in θpoints
+       output += expA(-θ)*fτᵋ(expA(θ)*z, ps_τ)        
+    end
+    output/(2*π*length(θpoints))
+end
+
+Jτᵋ(z, ps_τ) = Zygote.jacobian(z -> τᵋ(z, ps_τ, st_τ)[1], z)[1]
+
+function ΠₐJτᵋ(z, ps_τ) 
+    output = zeros(2,2)
+    for θ in θpoints
+        output += expA(-θ)*Jτᵋ(expA(θ)*z, ps_τ)
+        #display(output)
+    end
+    output/(2*π*length(θpoints))
+end
+
+loss_1(z, ps_τ, ps_F) = norm(ΠₐJτᵋ(z, ps_τ)*Fᵋ(z, ps_F) - Πₐfτᵋ(z, ps_τ))
+loss_2(z, ps_τ, ps_F, ε=.1) = norm(Jτᵋ(z, ps_τ)*A*z - A*(τᵋ(z, ps_τ, st_τ)[1]) - ε*(fτᵋ(z, ps_τ) - Jτᵋ(z, ps_τ)*Fᵋ(z, ps_F)))
diff --git a/scripts/sympnet_pendulum.jl b/scripts/sympnet_pendulum.jl
@@ -8,6 +8,7 @@ using LinearAlgebra
 using Lux
 using Plots
 using Zygote
+using ProgressMeter
 
 import Random 
 
@@ -18,47 +19,50 @@ q, p = pendulum_data()
 plt = plot(q, p, label="Training data.")
 
 # Sympnet model 
-model = Chain(  Gradient(2, 10, tanh), 
+model = Chain(  Gradient(2, 10, tanh),
+                Gradient(2, 10, tanh; change_q=false),
+                Gradient(2, 10, tanh),
                 Gradient(2, 10, tanh; change_q=false)
-                #Gradient(2, 10, tanh),
-                #Gradient(2, 10, tanh; change_q=false)
                 )
 
 ps, st = Lux.setup(Random.default_rng(), model)
 
+function loss_sing(ps, q, p, index)
+    qp_new = Lux.apply(model, [q[index-1], p[index-1]], ps, st)[1]
+    norm(qp_new - [q[index], p[index]])
+end
+
+
 # defines a loss function 
 function loss(ps, q, p, batch_size=10)
     loss = 0 
     ntime = lastindex(q)
     for i in 1:batch_size
         index = Int(ceil(rand()*ntime))
-        qp_new = Lux.apply(model, [q[index-1], p[index-1]], ps, st)[1]
-        loss += norm(qp_new - [q[index], p[index]])
+        loss += loss_sing(ps, q, p, index)
     end
     loss 
 end
 
 function full_loss(ps, q, p)
     loss = 0 
     for i in 1:lastindex(q)
-        qp_new = Lux.apply(model, [q[i-1], p[i-1]], ps, st)[1]
-        loss += norm(qp_new - [q[i], p[i]])
+        loss += loss_sing(ps, q, p, i)
     end
     loss 
 end 
 
 # define momentum optimizer and initialize
-o = MomentumOptimizer(1e-2, 0.5)
+o = AdamOptimizer()
 # initial gradients for calling Cache constructor
-dp_in = Zygote.gradient(ps -> loss(ps, q, p), ps)[1]
-cache = MomentumOptimizerCache(o, model, ps, dp_in)
+cache = init_optimizer_cache(model, o)
 
 # training 
 println("initial loss: ", full_loss(ps, q, p))
 training_steps = 1000
-for i in 1:training_steps
+@showprogress for i in 1:training_steps
     dp = Zygote.gradient(ps -> loss(ps, q, p), ps)[1]
-    apply!(o, cache, model, ps, dp)
+    optimization_step!(o, model, ps, cache, dp)
 end 
 println("final loss: ", full_loss(ps, q, p))
 

diff --git a/scripts/transformer.jl b/scripts/transformer.jl
@@ -0,0 +1,72 @@
+using GeometricMachineLearning, LinearAlgebra, ProgressMeter
+import Lux, Zygote, Random, MLDatasets, Flux
+
+#MNIST images are 28×28, so a sequence_length of 16 = 4² means the image patches are of size 7² = 49
+image_dim = 28
+patch_length = 7
+n_heads = 7
+n_layers = 5
+patch_number = (image_dim÷patch_length)^2
+
+train_x, train_y = MLDatasets.MNIST(split=:train)[:]
+test_x, test_y = MLDatasets.MNIST(split=:test)[:]
+
+
+#preprocessing steps 
+train_x =   Tuple(map(i -> split_and_flatten(train_x[:,:,i], patch_length), 1:size(train_x,3)))
+test_x =    Tuple(map(i -> split_and_flatten(test_x[:,:,i], patch_length), 1:size(test_x,3)))
+
+#implement this encoding yourself!
+train_y = Flux.onehotbatch(train_y, 0:9) #|> gpu
+test_y = Flux.onehotbatch(test_y, 0:9) #|> gpu
+
+
+#encoder layer - final layer has to be added for evaluation purposes!
+Ψᵉ = Lux.Chain(
+    Transformer(patch_length^2, n_heads, n_layers, Stiefel=true),
+    Lux.Dense(patch_length^2, 10, Lux.σ, use_bias=false)
+)
+
+ps, st = Lux.setup(Random.default_rng(), Ψᵉ) # .|> gpu  
+
+#loss_sing
+function loss_sing(ps, x, y)
+    norm(Lux.apply(Ψᵉ, x, ps, st)[1][:,1] - y)
+end
+function loss_sing(ps, train_x, train_y, index)
+    loss_sing(ps, train_x[index], train_y[:, index])    
+end
+function full_loss(ps, train_x, train_y)
+    num = length(train_x)
+    mapreduce(index -> loss_sing(ps, train_x, train_y, index), +, 1:num)
+end
+
+num = length(train_x)
+batch_size = 64
+training_steps = 1000
+
+o = AdamOptimizer()
+cache = init_optimizer_cache(Ψᵉ, o)
+
+println("initial loss: ", full_loss(ps, train_x, train_y)/num)
+
+@showprogress "Training network ..." for i in 1:training_steps
+    index₁ = Int(ceil(rand()*num))
+    x = train_x[index₁] #|> gpu
+    y = train_y[:, index₁] #|> gpu
+    l, pb = Zygote.pullback(ps -> loss_sing(ps, x, y), ps)
+    dp = pb(one(l))[1]
+    #dp = Zyogte.gradient(ps -> loss_sing(ps, x, y), ps)[1]
+
+    indices = Int.(ceil.(rand(batch_size -1)*num))
+    for index in indices
+        x = train_x[index] #|> gpu
+        y = train_y[:, index] #|> gpu
+        l, pb = Zygote.pullback(ps -> loss_sing(ps, x, y), ps)
+        dp = _add(dp, pb(one(l))[1])
+    end
+    optimization_step!(o, Ψᵉ, ps, cache, dp)
+end
+println("final loss: ", full_loss(ps, train_x, train_y)/num)
+
+println("\nfinal test loss: ", full_loss(ps, test_x, test_y)/length(test_x))