-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathBlake2bMinerCore.vhd
252 lines (219 loc) · 10.8 KB
/
Blake2bMinerCore.vhd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
-- Copyright (c) 2017, Pedro Rivera, all rights reserved.
--
-- === Blake2bMinerCore.vhd ===
--
-- This is a fully unrolled pipeline implementation of
-- the Blake2b hashing algorithm cut and optimized for Siacoin mining. It
-- takes 96 cycles to fill the pipeline when using 4-step MixG. For Blake2b
-- reference take a look at:
-- https://tools.ietf.org/html/rfc7693#section-3.2
-- https://en.wikipedia.org/wiki/BLAKE_(hash_function)
-- Reset Philosophy: Resets are avoided to let Vivado do whatever it wishes
-- with resets and keep those issues out of the way when trying to fill the
-- FPGA with cores, so...
-- - How does it reset the state of the pipeline registers when new work is pushed?
-- There is no need: changing the work data (BlockHeader) itself will produce a valid
-- result once it has gone through the full length of the pipeline. Hashes already in
-- flight may be invalid because the new work data is shared by all pipeline stages at once,
-- but this lasts a negligible time (~96 cycles) and is unlikely to solve the puzzle.
-- - How does it stay idle without burning a ton of power while not being operated?
-- Simply keeping the inputs constant keeps the flip-flops in the pipeline
-- at a steady state once the constant input has propagated through.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library work;
use work.PkgBlake2b.all;
entity Blake2bMinerCore is
  generic(
    -- Base value the nonce counters are reloaded from while Enable is low.
    -- The all-ones default wraps around to 0 on the first increment.
    kNonceSeed : unsigned(47 downto 0) := (others => '1')
  );
  port(
    -- Single clock for every register in the core.
    Clk         : in  std_logic;
    -- Nonce generator enable. While low, the nonce counters are held at
    -- their seed-derived start values; while high they count up every clock.
    Enable      : in  std_logic;
    -- 80-byte block header as ten 64-bit words. Word 4 is also read as the target.
    BlockHeader : in  U64Array_t(9 downto 0);
    -- Nonce associated with a hit; only meaningful while Success is high.
    NonceOut    : out unsigned(63 downto 0);
    -- High for the cycles in which the byte-swapped hash word compared below the target.
    Success     : out std_logic := '0'
  );
end entity Blake2bMinerCore;
architecture rtl of Blake2bMinerCore is
-- Width (number of 64-bit lanes) of each A/B/C/D/X/Y bundle handed to a QuadG mixer.
constant kGPerMixer : integer := 4;
-- Number of mixing rounds; sizes every per-round array below.
constant kMixRounds : integer := 12;
-- Pipeline depth in clocks; used by the Verify process to back-compute the reported nonce.
constant kPipeLength: integer := 4*2*12; -- 4 clks * 2 mixers * 12 rounds
-- One bundle of kGPerMixer 64-bit words per round.
type U64Array2D_t is array (integer range <>) of U64Array_t(kGPerMixer-1 downto 0);
-- One 48-bit nonce counter per round.
type U48Array_t is array (integer range <>) of unsigned(47 downto 0);
-- 16-word message block: words 9..0 carry BlockHeader, words 15..10 are zero padding.
signal Msg : U64Array_t(15 downto 0) := (others => kU64Zeros);
-- Hash0: first hash word; Hash0_be: its byte-reversed form; A2_out_dly: one-clock delayed
-- copy of the last round's A2_out(0); Target: comparison threshold taken from BlockHeader(4).
signal Hash0, Hash0_be, A2_out_dly, Target : unsigned(63 downto 0) := kU64Zeros;
-- Per-round inputs (state bundles A..D and message words X, Y) of the first mixer.
signal A1_in, B1_in, C1_in, D1_in, X1, Y1 : U64Array2D_t(kMixRounds-1 downto 0) := (others => (others => kU64Zeros));
-- Per-round inputs of the second mixer.
signal A2_in, B2_in, C2_in, D2_in, X2, Y2 : U64Array2D_t(kMixRounds-1 downto 0) := (others => (others => kU64Zeros));
-- Per-round outputs of the first mixer.
signal A1_out, B1_out, C1_out, D1_out : U64Array2D_t(kMixRounds-1 downto 0) := (others => (others => kU64Zeros));
-- Per-round outputs of the second mixer.
signal A2_out, B2_out, C2_out, D2_out : U64Array2D_t(kMixRounds-1 downto 0) := (others => (others => kU64Zeros));
-- Per-round nonce counters. Deliberately left without an initial value; they are loaded
-- by the RNG process whenever Enable is low.
-- NOTE(review): the commented-out initializer uses kU64Zeros, which looks like a 64-bit
-- constant and would not match these 48-bit elements - confirm before re-enabling it.
signal Nonce : U48Array_t(kMixRounds-1 downto 0);-- := (others => kU64Zeros); TODO: verify not initializing is beneficial
begin
----------------------------------------------------------------------------------------
-- Nonce Generator & Message Feed
----------------------------------------------------------------------------------------
-- Option A: Have 12x ~48-bit counters hooked up to each corresponding 'X' or 'Y' input.
-- 38 bits allow counting for 10 min @ 400 MHz. 10 additional bits allow having 1024 cores.
-- Option B: Have 96x ~48-bit registers shifting constantly (3,800 FF).
-- Or do with BRAM FIFOs.
-- Generated from the Sigma message schedule, the following table identifies which clock
-- cycles of the pipeline have the nonce as an input (4). Differentiating between X or Y
-- is useful because the 'Y' input is used 2 cycles later in the MixG function, thus
-- subtracting 2 from the counter removes the need for buffering the corresponding nonce.
-- X Y X Y X Y X Y
-- 00 01 02 03 (4) 05 06 07 Clk 0 <-
-- 08 09 10 11 12 13 14 15 Clk 4
-- 14 10 (4) 08 09 15 13 06 Clk 8 <-
-- 01 12 00 02 11 07 05 03 Clk 12
-- 11 08 12 00 05 02 15 13 Clk 16
-- 10 14 03 06 07 01 09 (4) Clk 20 <-
-- 07 09 03 01 13 12 11 14 Clk 24
-- 02 06 05 10 (4) 00 15 08 Clk 28 <-
-- 09 00 05 07 02 (4) 10 15 Clk 32 <-
-- 14 01 11 12 06 08 03 13 Clk 36
-- 02 12 06 10 00 11 08 03 Clk 40
-- (4) 13 07 05 15 14 01 09 Clk 44 <-
-- 12 05 01 15 14 13 (4) 10 Clk 48 <-
-- 00 07 06 03 09 02 08 11 Clk 52
-- 13 11 07 14 12 01 03 09 Clk 56
-- 05 00 15 (4) 08 06 02 10 Clk 60 <-
-- 06 15 14 09 11 03 00 08 Clk 64
-- 12 02 13 07 01 (4) 10 05 Clk 68 <-
-- 10 02 08 (4) 07 06 01 05 Clk 72 <-
-- 15 11 09 14 03 12 13 00 Clk 76
-- 00 01 02 03 (4) 05 06 07 Clk 80 <-
-- 08 09 10 11 12 13 14 15 Clk 84
-- 14 10 (4) 08 09 15 13 06 Clk 88 <-
-- 01 12 00 02 11 07 05 03 Clk 92
-- Block data is only 80 bytes long (10x 64-bit words)
-- Lower message words come straight from the block header, one word per assignment.
HeaderWords: for w in BlockHeader'range generate
  Msg(w) <= BlockHeader(w);
end generate HeaderWords;
-- The remaining words of the 16-word block are zero padding.
PadWords: for w in 10 to 15 generate
  Msg(w) <= (others => '0');
end generate PadWords;
-- Nonce generator: one free-running 48-bit counter per round.
-- While Enable is low every counter is reloaded with kNonceSeed minus a per-round
-- offset; otherwise all counters advance by one each clock.
RNG: process(Clk)
  -- Clock offset of each round's nonce tap, taken from the schedule table above.
  -- Taps on a 'Y' input carry an additional 2.
  type OffsetTable_t is array (0 to 11) of natural;
  constant kStartOffset : OffsetTable_t := (
    0  => 0,
    1  => 8,
    2  => 20 + 2, -- Y
    3  => 28,
    4  => 32 + 2, -- Y
    5  => 44,
    6  => 48,
    7  => 60 + 2, -- Y
    8  => 68 + 2, -- Y
    9  => 72 + 2, -- Y
    10 => 80,
    11 => 88
  );
begin
  if rising_edge(Clk) then
    if Enable /= '0' then
      -- Running: every counter steps in lockstep, preserving the offsets.
      for r in kStartOffset'range loop
        Nonce(r) <= Nonce(r) + 1;
      end loop;
    else
      -- Held: (re)load the staggered start values.
      for r in kStartOffset'range loop
        Nonce(r) <= kNonceSeed - kStartOffset(r);
      end loop;
    end if;
  end if;
end process;
-------------------------------------------------------------------------------------------
-- Mixer instantiation
-------------------------------------------------------------------------------------------
-- These series of for-generate loops wire all the stages of the pipeline statically.
-- Initialize the mix vector
-- Round 0 inputs: the 16-word working vector V0..V15, grouped four words per bundle.
-- V0..V3
A1_in(0) <= (0 => kHin(0), 1 => kHin(1), 2 => kHin(2), 3 => kHin(3));
-- V4..V7
B1_in(0) <= (0 => kHin(4), 1 => kHin(5), 2 => kHin(6), 3 => kHin(7));
-- V8..V11
C1_in(0) <= (0 => kIV(0), 1 => kIV(1), 2 => kIV(2), 3 => kIV(3));
-- V12..V15
D1_in(0) <= (
  0 => kIV(4) xor (x"00000000000000" & kMsgLen), -- V12: message length folded in
  1 => kIV(5),                                   -- V13
  2 => not kIV(6),                               -- V14 (last block, so invert)
  3 => kIV(7)                                    -- V15
);
-- One iteration per mixing round: wires the message feed, two QuadG mixers and the
-- lane rotations between them, then chains the result into the next round's inputs.
RoundGen: for i in 0 to kMixRounds-1 generate
-- Message feed.
-- Each lane j picks its message words through the kSigma schedule (row i mod 10).
-- Columns 2*j and 2*j+1 feed the first mixer, columns 2*j+8 and 2*j+9 the second.
-- Wherever the schedule selects word 4, this round's nonce counter, zero-extended
-- from 48 to 64 bits, is fed instead of Msg(4).
MsgFeedGen: for j in 0 to 3 generate
X1(i)(j) <= Msg(kSigma(i mod 10, 2*j)) when kSigma(i mod 10, 2*j) /= 4 else x"0000" & Nonce(i);
Y1(i)(j) <= Msg(kSigma(i mod 10, 2*j+1)) when kSigma(i mod 10, 2*j+1) /= 4 else x"0000" & Nonce(i);
X2(i)(j) <= Msg(kSigma(i mod 10, 2*j+8)) when kSigma(i mod 10, 2*j+8) /= 4 else x"0000" & Nonce(i);
Y2(i)(j) <= Msg(kSigma(i mod 10, 2*j+9)) when kSigma(i mod 10, 2*j+9) /= 4 else x"0000" & Nonce(i);
end generate;
-- First mixer of the round; operates on the bundles as they arrive.
-- NOTE(review): presumably four G functions in parallel (column step) - QuadG is
-- defined elsewhere, confirm there.
Mixer1: entity work.QuadG
port map(
Clk => Clk,
A_in => A1_in(i),
B_in => B1_in(i),
C_in => C1_in(i),
D_in => D1_in(i),
X => X1(i),
Y => Y1(i),
A_out => A1_out(i),
B_out => B1_out(i),
C_out => C1_out(i),
D_out => D1_out(i)
);
-- Re-align lanes for the second mixer. A passes straight through; lane k of B, C, D
-- takes lane (k+1), (k+2), (k+3) mod 4 of the first mixer's output respectively.
A2_in(i) <= A1_out(i);
B2_in(i) <= B1_out(i)(0) & B1_out(i)(3 downto 1); -- ror 1
C2_in(i) <= C1_out(i)(1 downto 0) & C1_out(i)(3 downto 2); -- ror 2
D2_in(i) <= D1_out(i)(2 downto 0) & D1_out(i)(3); -- ror 3
-- Second mixer of the round; operates on the rotated bundles.
Mixer2: entity work.QuadG
port map(
Clk => Clk,
A_in => A2_in(i),
B_in => B2_in(i),
C_in => C2_in(i),
D_in => D2_in(i),
X => X2(i),
Y => Y2(i),
A_out => A2_out(i),
B_out => B2_out(i),
C_out => C2_out(i),
D_out => D2_out(i)
);
-- Undo the lane rotation and hand the state to the next round. The last round has
-- no successor; its outputs are consumed directly by the hash computation.
AllButLastRound: if i /= kMixRounds-1 generate
A1_in(i+1) <= A2_out(i);
B1_in(i+1) <= B2_out(i)(2 downto 0) & B2_out(i)(3); -- rol 1
C1_in(i+1) <= C2_out(i)(1 downto 0) & C2_out(i)(3 downto 2); -- rol 2
D1_in(i+1) <= D2_out(i)(0) & D2_out(i)(3 downto 1); -- rol 3
end generate;
end generate RoundGen;
-- Only Hout(0) from the 12th Round is needed because it represents the most significant word
-- of the hash. Hash(0) = H0 xor V0 xor V8 where V0 is A2_out(last)(0), V8 is C2_out(last)(2).
-- But because of the pipeline, these two are delayed 1 or 2 clock cycles depending on the MixG
-- implementation. To do a coherent operation, A2_out (which comes out first) will be delayed
-- N cycles. Currently using 4-step MixG, hence the 1 cycle delay:
-- Hold the last round's A2_out lane 0 for one extra clock so it can be combined
-- with the C2_out lane 2 value in the expression below.
DelayV0: process(Clk)
begin
  if rising_edge(Clk) then
    A2_out_dly <= A2_out(A2_out'high)(0);
  end if;
end process;

-- First hash word: initial chaining value folded with both halves of the final state.
Hash0 <= C2_out(C2_out'high)(2) xor A2_out_dly xor kHin(0);

-- Sia represents the hash in big endian, so mirror the byte order:
-- byte b of Hash0 (counting from the least significant) lands in byte 7-b of Hash0_be.
ByteSwap: for b in 0 to 7 generate
  Hash0_be(63 - 8*b downto 56 - 8*b) <= Hash0(8*b + 7 downto 8*b);
end generate ByteSwap;
----------------------------------------------------------
-- Target Comparison
----------------------------------------------------------
Target <= BlockHeader(4); -- Word 4 contains the target

-- Target comparison and result reporting.
--
-- Each clock the byte-swapped hash word is compared against Target. On a hit,
-- Success is raised for that cycle and NonceOut carries the nonce that produced the
-- hash, reconstructed from the live round-0 counter minus the pipeline latency.
-- On a miss both outputs are driven to zero.
--
-- Hits are only reported once the pipeline has been fed by a continuously running
-- nonce generator for the full latency. Before that point the hash at the pipeline
-- output was computed from nonce values that were not consistent across rounds
-- (counters held at their staggered start values), and Nonce(0) does not map back to
-- the nonce that was hashed, so a hit would be a false positive or carry a wrong
-- nonce. The gate uses the same latency figure as the nonce reconstruction.
--
-- Not covered: a BlockHeader change while Enable stays high still yields up to one
-- pipeline length of mixed-header hashes, as described in the file header.
Verify: process(Clk)
  -- Clocks between a nonce appearing on Nonce(0) and its hash being compared here.
  constant kLatency : natural := kPipeLength + 1;
  -- Count of consecutive enabled clock edges, saturating at kLatency.
  -- A process variable keeps the gate local; no reset is needed beyond Enable.
  variable WarmUp : natural range 0 to kLatency := 0;
begin
  if rising_edge(Clk) then
    -- WarmUp is read before it is updated, so it reflects the edges that produced
    -- the Nonce(0) value visible at this edge.
    if WarmUp = kLatency and Hash0_be < Target then
      Success <= '1';
      NonceOut <= x"0000" & (Nonce(0) - kLatency); -- Corresponding nonce
    else
      Success <= '0';
      NonceOut <= (others => '0');
    end if;

    if Enable = '0' then
      -- Counters are being reloaded; everything entering the pipe from now on
      -- is invalid until the generator has run for kLatency clocks again.
      WarmUp := 0;
    elsif WarmUp < kLatency then
      WarmUp := WarmUp + 1;
    end if;
  end if;
end process;
end rtl;