\begin{slide}[\slideopts,toc={WRoPE}]{RoPE and WRoPE}
\vspace{-1em}
\begin{itemize}
\item Rotary Positional Embedding (RoPE) dedicates \emph{one arc direction} along the hypersphere to \emph{positional encoding}
\mpitem We can thus rotate our vector memory $\hv(n)$ by $\Delta$ radians each time step to ``age'' it:
\[
\hv_a(n) = e^{j\Delta}\hv(n), \quad \mbox{with}\; \Delta = \frac{2\pi}{L}
\]
where $L$ is the maximum sequence length (before reset)
\mpitem \textbf{Idea:} ``Warped RoPE'' (WRoPE) for \emph{arbitrarily long sequences} (processed in reverse):
\[
\Delta_n = \frac{2\pi n}{n+L}, \quad n=0,1,2,\ldots
\]
\maybepause
(inspired by the \emph{bilinear transform} used in digital filter design; a numerical sketch of both schedules appears after this list)
\mpitem A \emph{blend} of \emph{uniform} and \emph{warped} rotations can be used:
\[
\Delta_n = \funcalign{\frac{\pi n}{L}}{n=0,1,2,\ldots,L-1}{%
\pi + \frac{\pi n}{n+1}}{n=L,L+1,L+2,\ldots}
\]
where $L$ is now the \emph{typical} sequence length (giving it more
``space'' in recall)
\end{itemize}
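A minimal NumPy sketch of the two angle schedules above (variable names are
illustrative only, not from any library):
\begin{verbatim}
import numpy as np

L = 8                                   # maximum (or typical) sequence length
n = np.arange(4 * L)                    # time steps, arbitrarily long

delta_rope  = np.full(n.shape, 2 * np.pi / L)   # uniform 2*pi/L per step
delta_wrope = 2 * np.pi * n / (n + L)           # warped: approaches 2*pi, never wraps

# "Age" a complex memory vector by one uniform step: h_a(n) = e^{j*Delta} * h(n)
h = np.random.randn(16) + 1j * np.random.randn(16)
h_aged = np.exp(1j * delta_rope[0]) * h

print(delta_wrope[:4], delta_wrope[-1])  # early steps spread out; late steps crowd below 2*pi
\end{verbatim}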
\end{slide}
\begin{slide}[\slideopts]{WRoPE Memory}
% \vspace{-1em}
\begin{itemize}
\mpitem WRoPE sequences are naturally processed in reverse because, in a recursive memory, all stored angles can only be advanced by the same increment at each step:
\[
\hv_a(n) = e^{j\Delta_n}\hv(n), \quad n=0,1,2,\dots
\]
\mpitem This makes inference non-autoregressive (more expensive)
\mpitem One improvement is to \emph{store} past hidden states so that positional encodings can be updated arbitrarily when accessed:
\[
\hv_a(n,m) = e^{j\Delta_{n-m}}\hv(m), \quad m=n-L,\dots,n-1,n
\]
($m$th hidden state vector needed for inference at time $n$; see the sketch below)
\mpitem This is the same amount of storage needed by the Truncated Infinite Impulse Response (TIIR) technique,
which provides a recursively computed sliding window of memory
\mpitem In the TIIR case (fixed length $L$), might as well use normal RoPE
\mpitem WRoPE may be competitive for encoding ``journalistic style'' into a vector
\end{itemize}
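A minimal sketch of the stored-hidden-state variant above, assuming complex-valued
hidden vectors and illustrative names:
\begin{verbatim}
import numpy as np

L = 8

def wrope_delta(k):
    # Warped angle for an item that is k steps in the past: Delta_k = 2*pi*k/(k+L)
    return 2 * np.pi * k / (k + L)

rng = np.random.default_rng(0)
stored = [rng.standard_normal(16) + 1j * rng.standard_normal(16)
          for _ in range(20)]           # h(m), m = 0..19, kept un-rotated

def aged_view(n):
    # Positional encoding applied only at access time: h_a(n,m) = e^{j*Delta_{n-m}} h(m)
    return [np.exp(1j * wrope_delta(n - m)) * h for m, h in enumerate(stored)]

views = aged_view(n=len(stored) - 1)    # the most recent item gets Delta_0 = 0
\end{verbatim}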
\end{slide}
%% NOT FINISHED:
\begin{slide}[\slideopts,toc={}]{Searching a Time Range}
\vspace{-1em}
\begin{itemize}
\mpitem Reserving complex angle for positional encoding $\Leftrightarrow$ \emph{real} vocabulary
embedding vectors
\mpitem Downstream weights and biases must be \emph{complex}
\mpitem Complex angle discarded by absolute value when no longer needed
\mpitem Exact orthogonality is obtained for each of the $L$ RoPE time steps when there is
no amplitude decay (unlike in normal RNNs), \ie, $e^{jn\Delta}\xv(n)\perp e^{jm\Delta}\xv(n)\, \forall n\ne m$
\mpitem The RoPE query vector for a vocabulary vector (``token'') $\wv$ occurring at any time between $n_1$ and $n_2$ is
\[
\qv \eqsp \sum_{n=n_1}^{n_2}e^{jn \Delta}\wv
\eqsp \wv \frac{e^{j(n_2+1)\Delta} - e^{jn_1\Delta}}{e^{j\Delta}-1}
\eqsp \wv\, e^{j\theta_{12}}\frac{\sin\left(\frac{n_2-n_1+1}{2}\Delta\right)}{\sin(\Delta/2)}
\]
where $\theta_{12} = (n_1+n_2)\Delta/2$ (checked numerically after this list)
\vspace{-1em}
\end{itemize}
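A quick numerical check of the closed form above (purely illustrative):
\begin{verbatim}
import numpy as np

L, n1, n2 = 16, 3, 9
delta = 2 * np.pi / L
w = np.random.randn(8) + 0j             # real "token" embedding as a complex array

# Direct sum of rotated copies over the time range [n1, n2]
q_sum = sum(np.exp(1j * n * delta) * w for n in range(n1, n2 + 1))

# Geometric-series form, then the sin-ratio (Dirichlet-kernel) form
q_geo = w * (np.exp(1j*(n2+1)*delta) - np.exp(1j*n1*delta)) / (np.exp(1j*delta) - 1)
theta12 = (n1 + n2) * delta / 2
q_sin = w * np.exp(1j*theta12) * np.sin((n2-n1+1)*delta/2) / np.sin(delta/2)

assert np.allclose(q_sum, q_geo) and np.allclose(q_sum, q_sin)
\end{verbatim}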
\end{slide}
\begin{slide}[\slideopts,toc={Compressed Time}]{Compressed Time Retrieval}
\begin{itemize}
\mpitem Recall that a query vector $\wv$ detects a stored vector $\xv$ in the vector memory $\hv$ when the inner product $\wv^T\xv$
exceeds some detection threshold $b$, \ie, $\wv^T\xv > b$
\mpitem The threshold $b$ can be lowered to capture a wider range of similar vectors
\mpitem The captured vectors lie in a cone of half-angle $\theta$ centered on the vector $\wv$,
where $\cos(\theta)=b$ (assuming unit-norm vectors)
\mpitem In WRoPE, the angular spacing between successive positions shrinks with $n$ (the angle approaches $2\pi$ roughly as $1/n$)
\mpitem ...
\end{itemize}
This causes inner products to increase as vectors get squeezed together along the complex-angle arc, as illustrated in the sketch below.
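A small sketch of the threshold test and the compressed-time effect (unit-norm
vectors assumed; names illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
d, b, L = 64, 0.6, 8                   # embedding size, threshold, typical length
theta = np.arccos(b)                   # half-angle of the capture cone, cos(theta) = b

w = rng.standard_normal(d); w /= np.linalg.norm(w)   # unit query vector
x = rng.standard_normal(d); x /= np.linalg.norm(x)   # unit stored vector
detected = (w @ x) > b                 # inner-product detection test

# Compressed time: late WRoPE angles crowd together, so rotated copies of the
# same unit vector have (real-part) inner products approaching 1
ang = lambda n: 2 * np.pi * n / (n + L)
print(np.cos(ang(11) - ang(10)), np.cos(ang(1001) - ang(1000)))
\end{verbatim}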
\end{slide}
\begin{slide}[\slideopts,toc={Reservations}]{Reserving Angular Dimensions}
\vspace{-1em}
Reserving the radial dimension looks easy (\eg, \tx{RMSNorm} every $k$ gradient-descent steps)\\
How to reserve an \emph{arc dimension} for [W]RoPE and perhaps other purposes?
\begin{itemize}
% \mpitem \emph{Augment the space with new angular dimensions:}
% \begin{enumerate}
\mpitem RoPE: Effectively reserves \emph{complex angle:} $\hv_a(n) = e^{j\Delta}\hv(n)$\\
(data size doubled: $\xv \to (\xv,\zv)$)
\mpitem A zero imaginary part in the vocabulary embedding reserves complex angle % for [W]RoPE
\mpitem \emph{Quaternions} give two more angles
$\Leftrightarrow$ embedding size quadrupled: $\xv \to (\xv,\zv,\zv,\zv)$ (see the sketch after this list)\\
\begin{itemize}
\mpitem $\hv_a(n) = \qb\,\hv(n)$, where, from
\htmladdnormallink{Wikipedia}{https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation}:\\
$
% \displaystyle
\qb = e^{{\frac {\Delta}{2}}{(\av_x\ib +\av_y\jb +\av_z\kb )}}
= \cos {\frac {\Delta }{2}}+(\av_x\ib +\av_y\jb + \av_z\kb )\sin {\frac {\Delta }{2}}
= \cos {\frac {\Delta }{2}}+\av\sin {\frac {\Delta }{2}}
$
\end{itemize}
\mpitem See the Cayley--Dickson construction for $2^k-1$ angles, $k=1,2,3,\ldots$
\mpitem Trivial in \emph{polar coordinates} (``write-protect'' reserved dimensions during gradient step),\\
but then back-propagation must be rewritten for polar coordinates
% \\ Replace weight matrices $\Wmtx$ with \emph{rotation matrices?}
\mpitem Find a suitable method of \emph{constrained gradient descent}
\mpitem Train (or calculate) a \emph{pointwise MLP} that ``de-rotates'' $\xv+\mu\gradv$ given also $\xv$
\mpitem Use \emph{reserved translational dimension(s)} instead (``TraPE'') (omitted from \tx{RMSNorm})
% Illustrate "meta" reserved translation dimension: \myFigureToWidth{threeGinaSpheres}{0.5\twidth}{}
\end{itemize}
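A minimal sketch of the quaternion option above, treating each embedding coordinate
as a quaternion whose three imaginary (reserved) components start at zero
(illustrative only):
\begin{verbatim}
import numpy as np

def qmul(a, b):
    # Hamilton product of quaternions stored as (..., 4) arrays (w, x, y, z)
    aw, ax, ay, az = np.moveaxis(a, -1, 0)
    bw, bx, by, bz = np.moveaxis(b, -1, 0)
    return np.stack([aw*bw - ax*bx - ay*by - az*bz,
                     aw*bx + ax*bw + ay*bz - az*by,
                     aw*by - ax*bz + ay*bw + az*bx,
                     aw*bz + ax*by - ay*bx + az*bw], axis=-1)

# Unit rotation quaternion q = cos(D/2) + a*sin(D/2) for a unit axis a
delta = 2 * np.pi / 8
axis = np.array([1.0, 0.0, 0.0])                       # (a_x, a_y, a_z)
q = np.concatenate([[np.cos(delta / 2)], np.sin(delta / 2) * axis])

# Embedding of size d, quadrupled to (d, 4): real part x plus three reserved zeros
d = 16
h = np.concatenate([np.random.randn(d, 1), np.zeros((d, 3))], axis=1)
h_aged = qmul(np.broadcast_to(q, h.shape), h)          # h_a(n) = q * h(n)
\end{verbatim}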
\end{slide}