sha2.lua 270 KB


  --------------------------------------------------------------------------------------------------------------------------
  -- sha2.lua
  --------------------------------------------------------------------------------------------------------------------------
  -- VERSION: 12 (2022-02-23)
  -- AUTHOR:  Egor Skriptunoff
  -- LICENSE: MIT (the same license as Lua itself)
  -- URL:     https://github.com/Egor-Skriptunoff/pure_lua_SHA
  --
  -- DESCRIPTION:
  --    This module contains functions to calculate SHA digest:
  --       MD5, SHA-1,
  --       SHA-224, SHA-256, SHA-512/224, SHA-512/256, SHA-384, SHA-512,
  --       SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256,
  --       HMAC,
  --       BLAKE2b, BLAKE2s, BLAKE2bp, BLAKE2sp, BLAKE2Xb, BLAKE2Xs,
  --       BLAKE3, BLAKE3_KDF
  --    Written in pure Lua.
  --    Compatible with:
  --       Lua 5.1, Lua 5.2, Lua 5.3, Lua 5.4, Fengari, LuaJIT 2.0/2.1 (any CPU endianness).
  --    Main feature of this module: it was heavily optimized for speed.
  --    For every Lua version the module contains a particular implementation branch to get benefits from version-specific features.
  --       - branch for Lua 5.1 (emulating bitwise operators using look-up table)
  --       - branch for Lua 5.2 (using bit32/bit library), suitable for both Lua 5.2 with native "bit32" and Lua 5.1 with external library "bit"
  --       - branch for Lua 5.3/5.4 (using native 64-bit bitwise operators)
  --       - branch for Lua 5.3/5.4 (using native 32-bit bitwise operators) for Lua built with LUA_INT_TYPE=LUA_INT_INT
  --       - branch for LuaJIT without FFI library (useful in a sandboxed environment)
  --       - branch for LuaJIT x86 without FFI library (LuaJIT x86 has oddity because of lack of CPU registers)
  --       - branch for LuaJIT 2.0 with FFI library (bit.* functions work only with Lua numbers)
  --       - branch for LuaJIT 2.1 with FFI library (bit.* functions can work with "int64_t" arguments)
  --
  --
  -- USAGE:
  --    Input data should be provided as a binary string: either as a whole string or as a sequence of substrings (chunk-by-chunk loading, total length < 9*10^15 bytes).
  --    Result (SHA digest) is returned in hexadecimal representation as a string of lowercase hex digits.
  --    Simplest usage example:
  --       local sha = require("sha2")
  --       local your_hash = sha.sha256("your string")
  --    See file "sha2_test.lua" for more examples.
  --
  --
  -- CHANGELOG:
  --  version     date      description
  --  -------  ----------   -----------
  --    12     2022-02-23   Now works in Luau (but NOT optimized for speed)
  --    11     2022-01-09   BLAKE3 added
  --    10     2022-01-02   BLAKE2 functions added
  --     9     2020-05-10   Now works in OpenWrt's Lua (dialect of Lua 5.1 with "double" + "invisible int32")
  --     8     2019-09-03   SHA-3 functions added
  --     7     2019-03-17   Added functions to convert to/from base64
  --     6     2018-11-12   HMAC added
  --     5     2018-11-10   SHA-1 added
  --     4     2018-11-03   MD5 added
  --     3     2018-11-02   Bug fixed: incorrect hashing of long (2 GByte) data streams on Lua 5.3/5.4 built with "int32" integers
  --     2     2018-10-07   Decreased module loading time in Lua 5.1 implementation branch (thanks to Peter Melnichenko for giving a hint)
  --     1     2018-10-06   First release (only SHA-2 functions)
  -----------------------------------------------------------------------------
  -- Set to true to view some messages about your system's abilities and the implementation branch chosen for your system
  local print_debug_messages = false

  -- Standard library functions cached in locals.
  -- "unpack" is taken from "table.unpack" when it exists, otherwise from the global "unpack".
  local unpack, table_concat, byte, char, string_rep, sub, gsub, gmatch, string_format, floor, ceil, math_min, math_max, tonumber, type, math_huge =
     table.unpack or unpack, table.concat, string.byte, string.char, string.rep, string.sub, string.gsub, string.gmatch, string.format, math.floor, math.ceil, math.min, math.max, tonumber, type, math.huge
  --------------------------------------------------------------------------------
  -- EXAMINING YOUR SYSTEM
  --------------------------------------------------------------------------------

  -- Measures how many bits a Lua number datatype can hold exactly.
  --    one: must be either float 1.0 or integer 1
  -- Returns: bits_precision, is_integer
  -- This function works correctly with all floating point datatypes (including non-IEEE-754)
  local function get_precision(one)
     local k, n, m, prev_n = 0, one, one
     while true do
        -- Every iteration appends one more low bit to "n" (always 1) and to "m" (alternating 0/1)
        k, prev_n, n, m = k + 1, n, n + n + 1, m + m + k % 2
        if k > 256 or n - (n - 1) ~= 1 or m - (m - 1) ~= 1 or n == m then
           -- unit steps are no longer exact (or the safety limit of 256 iterations was hit)
           return k, false   -- floating point datatype
        elseif n == prev_n then
           -- "n + n + 1" reproduced "n" itself
           return k, true    -- integer datatype
        end
     end
  end
  -- Make sure Lua has "double" numbers
  local x = 2/3
  local Lua_has_double = x * 5 > 3 and x * 4 < 3 and get_precision(1.0) >= 53
  assert(Lua_has_double, "at least 53-bit floating point numbers are required")

  -- Q:
  --    SHA2 was designed for FPU-less machines.
  --    So, why are floating point numbers needed for this module?
  -- A:
  --    53-bit "double" numbers are useful to calculate "magic numbers" used in SHA.
  --    I prefer to write 50 LOC "magic numbers calculator" instead of storing more than 200 constants explicitly in this source file.

  -- Precision of the value 1 (an integer where Lua has an integer subtype, otherwise the same float as above)
  local int_prec, Lua_has_integers = get_precision(1)
  local Lua_has_int64 = Lua_has_integers and int_prec == 64
  local Lua_has_int32 = Lua_has_integers and int_prec == 32
  assert(Lua_has_int64 or Lua_has_int32 or not Lua_has_integers, "Lua integers must be either 32-bit or 64-bit")

  -- Q:
  --    Does it mean that almost all non-standard configurations are not supported?
  -- A:
  --    Yes.  Sorry, too many problems to support all possible Lua numbers configurations.
  --       Lua 5.1/5.2    with "int32"               will not work.
  --       Lua 5.1/5.2    with "int64"               will not work.
  --       Lua 5.1/5.2    with "int128"              will not work.
  --       Lua 5.1/5.2    with "float"               will not work.
  --       Lua 5.1/5.2    with "double"              is OK.          (default config for Lua 5.1, Lua 5.2, LuaJIT)
  --       Lua 5.3/5.4    with "int32"  + "float"    will not work.
  --       Lua 5.3/5.4    with "int64"  + "float"    will not work.
  --       Lua 5.3/5.4    with "int128" + "float"    will not work.
  --       Lua 5.3/5.4    with "int32"  + "double"   is OK.          (config used by Fengari)
  --       Lua 5.3/5.4    with "int64"  + "double"   is OK.          (default config for Lua 5.3, Lua 5.4)
  --       Lua 5.3/5.4    with "int128" + "double"   will not work.
  --   Using floating point numbers better than "double" instead of "double" is OK (non-IEEE-754 floating point implementations are allowed).
  --   Using "int128" instead of "int64" is not OK: "int128" would require a different branch of implementation for optimized SHA512.
  -- Check for LuaJIT and 32-bit bitwise libraries
  -- LuaJIT 1.x.x and Luau are treated as vanilla Lua 5.1/5.2
  -- (the table constructor below is used as the probe that tells LuaJIT apart from the other implementations)
  local is_LuaJIT = ({false, [1] = true})[1] and _VERSION ~= "Luau" and (type(jit) ~= "table" or jit.version_num >= 20000)
  local is_LuaJIT_21  -- LuaJIT 2.1+
  local LuaJIT_arch
  local ffi           -- LuaJIT FFI library (as a table)
  local b             -- 32-bit bitwise library (as a table)
  local library_name

  if is_LuaJIT then
     -- Assuming "bit" library is always available on LuaJIT
     b = require"bit"
     library_name = "bit"
     -- "ffi" is intentionally disabled on some systems for safety reason
     local LuaJIT_has_FFI, result = pcall(require, "ffi")
     if LuaJIT_has_FFI then
        ffi = result
     end
     -- the binary literal "0b0" is only accepted by the parser of LuaJIT 2.1+
     is_LuaJIT_21 = not not loadstring"b=0b0"
     LuaJIT_arch = type(jit) == "table" and jit.arch or ffi and ffi.arch or nil
  else
     -- For vanilla Lua, "bit"/"bit32" libraries are searched in global namespace only.  No attempt is made to load a library if it's not loaded yet.
     -- On Lua 5.2 "bit32" is tried first, everywhere else "bit" is tried first.
     for _, libname in ipairs(_VERSION == "Lua 5.2" and {"bit32", "bit"} or {"bit", "bit32"}) do
        if type(_G[libname]) == "table" and _G[libname].bxor then
           b = _G[libname]
           library_name = libname
           break
        end
     end
  end
  --------------------------------------------------------------------------------
  -- You can disable here some of your system's abilities (for testing purposes)
  --------------------------------------------------------------------------------
  -- is_LuaJIT = nil
  -- is_LuaJIT_21 = nil
  -- ffi = nil
  -- Lua_has_int32 = nil
  -- Lua_has_int64 = nil
  -- b, library_name = nil
  --------------------------------------------------------------------------------

  if print_debug_messages then
     -- Printing list of abilities of your system
     print("Abilities:")
     print(" Lua version: "..(is_LuaJIT and "LuaJIT "..(is_LuaJIT_21 and "2.1 " or "2.0 ")..(LuaJIT_arch or "")..(ffi and " with FFI" or " without FFI") or _VERSION))
     print(" Integer bitwise operators: "..(Lua_has_int64 and "int64" or Lua_has_int32 and "int32" or "no"))
     print(" 32-bit bitwise library: "..(library_name or "not found"))
  end

  -- Selecting the most suitable implementation for given set of abilities.
  -- "branch" is the short tag tested by the rest of the module, "method" is its human-readable description.
  -- The conditions are checked in priority order: LuaJIT+FFI, LuaJIT, int64, int32, bitwise library, emulation.
  local method, branch
  if is_LuaJIT and ffi then
     method = "Using 'ffi' library of LuaJIT"
     branch = "FFI"
  elseif is_LuaJIT then
     method = "Using special code for sandboxed LuaJIT (no FFI)"
     branch = "LJ"
  elseif Lua_has_int64 then
     method = "Using native int64 bitwise operators"
     branch = "INT64"
  elseif Lua_has_int32 then
     method = "Using native int32 bitwise operators"
     branch = "INT32"
  elseif library_name then   -- when bitwise library is available (Lua 5.2 with native library "bit32" or Lua 5.1 with external library "bit")
     method = "Using '"..library_name.."' library"
     branch = "LIB32"
  else
     method = "Emulating bitwise operators using look-up table"
     branch = "EMUL"
  end

  if print_debug_messages then
     -- Printing the implementation selected to be used on your system
     print("Implementation selected:")
     print(" "..method)
  end
  --------------------------------------------------------------------------------
  -- BASIC 32-BIT BITWISE FUNCTIONS
  --------------------------------------------------------------------------------

  local AND, OR, XOR, SHL, SHR, ROL, ROR, NOT, NORM, HEX, XOR_BYTE
  -- Only low 32 bits of function arguments matter, high bits are ignored
  -- The result of all functions (except HEX) is an integer inside "correct range":
  --    for "bit" library:    (-2^31)..(2^31-1)
  --    for "bit32" library:        0..(2^32-1)
  -- Nothing is assigned here for the "INT64" and "INT32" branches.

  if branch == "FFI" or branch == "LJ" or branch == "LIB32" then

     -- Your system has 32-bit bitwise library (either "bit" or "bit32")

     AND  = b.band                -- 2 arguments
     OR   = b.bor                 -- 2 arguments
     XOR  = b.bxor                -- 2..5 arguments
     SHL  = b.lshift              -- second argument is integer 0..31
     SHR  = b.rshift              -- second argument is integer 0..31
     ROL  = b.rol or b.lrotate    -- second argument is integer 0..31
     ROR  = b.ror or b.rrotate    -- second argument is integer 0..31
     NOT  = b.bnot                -- only for LuaJIT
     NORM = b.tobit               -- only for LuaJIT
     HEX  = b.tohex               -- returns string of 8 lowercase hexadecimal digits
     assert(AND and OR and XOR and SHL and SHR and ROL and ROR and NOT, "Library '"..library_name.."' is incomplete")
     XOR_BYTE = XOR               -- XOR of two bytes (0..255)

  elseif branch == "EMUL" then

     -- Emulating 32-bit bitwise operations using 53-bit floating point arithmetic

     function SHL(x, n)
        return (x * 2^n) % 2^32
     end

     function SHR(x, n)
        x = x % 2^32 / 2^n
        return x - x % 1          -- drop the fractional part (the bits shifted out)
     end

     function ROL(x, n)
        x = x % 2^32 * 2^n
        local r = x % 2^32        -- bits that stayed below 2^32
        return r + (x - r) / 2^32 -- bits that went above 2^32 are moved to the bottom
     end

     function ROR(x, n)
        x = x % 2^32 / 2^n
        local r = x % 1           -- fractional part = bits shifted out at the bottom
        return r * 2^32 + (x - r) -- they are moved to the top
     end

     local AND_of_two_bytes = {[0] = 0}  -- look-up table (256*256 entries)
     -- The table is filled incrementally: every new entry is derived from an entry stored earlier (starting from the seed [0] = 0)
     local idx = 0
     for y = 0, 127 * 256, 256 do
        for x = y, y + 127 do
           x = AND_of_two_bytes[x] * 2
           AND_of_two_bytes[idx] = x
           AND_of_two_bytes[idx + 1] = x
           AND_of_two_bytes[idx + 256] = x
           AND_of_two_bytes[idx + 257] = x + 1
           idx = idx + 2
        end
        idx = idx + 256
     end

     -- Computes AND byte-by-byte through the look-up table; OR and XOR are derived from AND:
     --    x OR  y = x + y - 1 * (x AND y)
     --    x XOR y = x + y - 2 * (x AND y)
     local function and_or_xor(x, y, operation)
        -- operation: nil = AND, 1 = OR, 2 = XOR
        local x0 = x % 2^32
        local y0 = y % 2^32
        local rx = x0 % 256
        local ry = y0 % 256
        local res = AND_of_two_bytes[rx + ry * 256]
        x = x0 - rx
        y = (y0 - ry) / 256
        rx = x % 65536
        ry = y % 256
        res = res + AND_of_two_bytes[rx + ry] * 256
        x = (x - rx) / 256
        y = (y - ry) / 256
        rx = x % 65536 + y % 256
        res = res + AND_of_two_bytes[rx] * 65536
        res = res + AND_of_two_bytes[(x + y - rx) / 256] * 16777216
        if operation then
           res = x0 + y0 - operation * res
        end
        return res
     end

     function AND(x, y)
        return and_or_xor(x, y)
     end

     function OR(x, y)
        return and_or_xor(x, y, 1)
     end

     function XOR(x, y, z, t, u)          -- 2..5 arguments
        -- the optional arguments are folded from the right
        if z then
           if t then
              if u then
                 t = and_or_xor(t, u, 2)
              end
              z = and_or_xor(z, t, 2)
           end
           y = and_or_xor(y, z, 2)
        end
        return and_or_xor(x, y, 2)
     end

     function XOR_BYTE(x, y)
        -- both arguments must be bytes (0..255): the table is indexed directly
        return x + y - 2 * AND_of_two_bytes[x + y * 256]
     end

  end
  -- HEX(x) returns string of 8 lowercase hexadecimal digits.
  -- An already assigned HEX is kept as it is.
  -- Otherwise one of two fallbacks is chosen, depending on whether string.format("%x") accepts the value 2^31.
  HEX = HEX
     or
        pcall(string_format, "%x", 2^31) and
           function (x)  -- returns string of 8 lowercase hexadecimal digits
              return string_format("%08x", x % 4294967296)
           end
     or
        function (x)  -- for OpenWrt's dialect of Lua
           -- the value is moved into the range (-2^31)..(2^31-1) before formatting
           return string_format("%08x", (x + 2^31) % 2^32 - 2^31)
        end
  -- Returns x XOR y reduced to the range 0..(2^32-1).
  -- When "y" is omitted, x is XORed with the constant 0xA5A5A5A5.
  local function XORA5(x, y)
     return XOR(x, y or 0xA5A5A5A5) % 4294967296
  end
  -- Returns a new Lua table holding 25 zeroes (indices 1..25).
  local function create_array_of_lanes()
     return {
        0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,
     }
  end
  --------------------------------------------------------------------------------
  -- CREATING OPTIMIZED INNER LOOP
  --------------------------------------------------------------------------------

  -- Inner loop functions
  local sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64

  -- Arrays of SHA-2 "magic numbers" (in "INT64" and "FFI" branches "*_lo" arrays contain 64-bit values)
  local sha2_K_lo, sha2_K_hi, sha2_H_lo, sha2_H_hi, sha3_RC_lo, sha3_RC_hi = {}, {}, {}, {}, {}, {}
  -- The [256] and [512] entries below are the very same tables as sha2_H_hi / sha2_H_lo (shared, not copied)
  local sha2_H_ext256 = {[224] = {}, [256] = sha2_H_hi}
  local sha2_H_ext512_lo, sha2_H_ext512_hi = {[384] = {}, [512] = sha2_H_lo}, {[384] = {}, [512] = sha2_H_hi}
  local md5_K, md5_sha1_H = {}, {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0}
  local md5_next_shift = {0, 0, 0, 0, 0, 0, 0, 0, 28, 25, 26, 27, 0, 0, 10, 9, 11, 12, 0, 15, 16, 17, 18, 0, 20, 22, 23, 21}
  local HEX64, lanes_index_base  -- defined only for branches that internally use 64-bit integers: "INT64" and "FFI"
  local common_W = {}  -- temporary table shared between all calculations (to avoid creating new temporary table every time)
  local common_W_blake2b, common_W_blake2s, v_for_blake2s_feed_64 = common_W, common_W, {}
  local K_lo_modulo, hi_factor, hi_factor_keccak = 4294967296, 0, 0

  -- Index permutations (1-based); rows 11 and 12 are the same tables as rows 1 and 2
  local sigma = {
     {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
     { 15, 11,  5,  9, 10, 16, 14,  7,  2, 13,  1,  3, 12,  8,  6,  4 },
     { 12,  9, 13,  1,  6,  3, 16, 14, 11, 15,  4,  7,  8,  2, 10,  5 },
     {  8, 10,  4,  2, 14, 13, 12, 15,  3,  7,  6, 11,  5,  1, 16,  9 },
     { 10,  1,  6,  8,  3,  5, 11, 16, 15,  2, 12, 13,  7,  9,  4, 14 },
     {  3, 13,  7, 11,  1, 12,  9,  4,  5, 14,  8,  6, 16, 15,  2, 10 },
     { 13,  6,  2, 16, 15, 14,  5, 11,  1,  8,  7,  4, 10,  3,  9, 12 },
     { 14, 12,  8, 15, 13,  2,  4, 10,  6,  1, 16,  5,  9,  7,  3, 11 },
     {  7, 16, 15, 10, 12,  4,  1,  9, 13,  3, 14,  8,  2,  5, 11,  6 },
     { 11,  3,  9,  5,  8,  7,  2,  6, 16, 12, 10, 15,  4, 13, 14,  1 },
  }; sigma[11], sigma[12] = sigma[1], sigma[2]

  local perm_blake3 = {
     1, 3, 4, 11, 13, 10, 12, 6,
     1, 3, 4, 11, 13, 10,
     2, 7, 5, 8, 14, 15, 16, 9,
     2, 7, 5, 8, 14, 15,
  }
  -- Builds a table of format strings.
  --    elem: string that is repeated "size" times
  -- Returns a table with keys 1, 9, 13, 17, 18, 21; the value for every key "size" is "<" followed by "size" copies of "elem"
  local function build_keccak_format(elem)
     local keccak_format = {}
     for _, size in ipairs{1, 9, 13, 17, 18, 21} do
        keccak_format[size] = "<"..string_rep(elem, size)
     end
     return keccak_format
  end
  333. if branch == "FFI" then
  -- Replacing the shared Lua tables with FFI arrays
  local common_W_FFI_int32 = ffi.new("int32_t[?]", 80)   -- 64 is enough for SHA256, but 80 is needed for SHA-1
  common_W_blake2s = common_W_FFI_int32
  v_for_blake2s_feed_64 = ffi.new("int32_t[?]", 16)
  -- The permutation tables are converted to byte arrays; an extra leading 0 is stored at index 0,
  -- so the original 1-based indices keep addressing the same values
  perm_blake3 = ffi.new("uint8_t[?]", #perm_blake3 + 1, 0, unpack(perm_blake3))
  for j = 1, 10 do
     sigma[j] = ffi.new("uint8_t[?]", #sigma[j] + 1, 0, unpack(sigma[j]))
  end; sigma[11], sigma[12] = sigma[1], sigma[2]   -- the aliases must follow the new arrays
  -- SHA256 implementation for "LuaJIT with FFI" branch
  --    H:    array of 8 state words H[1]..H[8], updated in place
  --    str:  string containing the data
  --    offs: number of bytes of "str" to skip before the first block
  --    size: number of bytes to process
  -- Returns nothing.
  function sha256_feed_64(H, str, offs, size)
     -- offs >= 0, size >= 0, size is multiple of 64
     local W, K = common_W_FFI_int32, sha2_K_hi
     for pos = offs, offs + size - 1, 64 do
        -- Loading the 64-byte block as 16 words; the first byte of every 4 goes to the highest bits
        for j = 0, 15 do
           pos = pos + 4
           local a, b, c, d = byte(str, pos - 3, pos)   -- slow, but doesn't depend on endianness
           W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
        end
        -- Extending the 16 words to 64 words
        for j = 16, 63 do
           local a, b = W[j-15], W[j-2]
           W[j] = NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16] )
        end
        local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        -- 64 rounds, 8 rounds per loop iteration
        for j = 0, 63, 8 do  -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
           local z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j] + K[j+1] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+1] + K[j+2] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+2] + K[j+3] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+3] + K[j+4] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+4] + K[j+5] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+5] + K[j+6] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+6] + K[j+7] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+7] + K[j+8] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
        end
        -- Adding the result of this block to the state
        H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
        H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
     end
  end
  -- 64-bit work array (BLAKE2b shares it) and the ctypes used for 64-bit arithmetic
  local common_W_FFI_int64 = ffi.new("int64_t[?]", 80)
  common_W_blake2b = common_W_FFI_int64
  local int64 = ffi.typeof"int64_t"
  local int32 = ffi.typeof"int32_t"
  local uint32 = ffi.typeof"uint32_t"
  hi_factor = int64(2^32)
  392. if is_LuaJIT_21 then -- LuaJIT 2.1 supports bitwise 64-bit operations
  -- introducing synonyms for better code readability
  -- (these are the same functions as the 32-bit ones: on LuaJIT 2.1 bit.* functions can work with "int64_t" arguments)
  local AND64, OR64, XOR64, NOT64, SHL64, SHR64, ROL64, ROR64
      = AND,   OR,   XOR,   NOT,   SHL,   SHR,   ROL,   ROR
  HEX64 = HEX
  -- BLAKE2b implementation for "LuaJIT 2.1 + FFI" branch
  do
     local v = ffi.new("int64_t[?]", 16)   -- 16-element work vector shared by G() and blake2b_feed_128()
     local W = common_W_blake2b

     -- Mixing function: updates v[a], v[b], v[c], v[d] using the message words W[k1] and W[k2]
     local function G(a, b, c, d, k1, k2)
        local va, vb, vc, vd = v[a], v[b], v[c], v[d]
        va = W[k1] + (va + vb)
        vd = ROR64(XOR64(vd, va), 32)
        vc = vc + vd
        vb = ROR64(XOR64(vb, vc), 24)
        va = W[k2] + (va + vb)
        vd = ROR64(XOR64(vd, va), 16)
        vc = vc + vd
        vb = ROL64(XOR64(vb, vc), 1)
        v[a], v[b], v[c], v[d] = va, vb, vc, vd
     end

     -- Compresses "size" bytes of "str" (starting after "offs" bytes) into the state H[1]..H[8], in place.
     --    str:              string containing the data; when it is nil/false the current content of W is compressed instead
     --    bytes_compressed: number of bytes compressed so far
     --    last_block_size:  when given, this number (instead of 128) is added to the counter and the flag f0 is set
     --    is_last_node:     when true, the flag f1 is set
     -- The second parameter is not used.
     -- Returns the updated value of bytes_compressed.
     function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        -- offs >= 0, size >= 0, size is multiple of 128
        local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        for pos = offs, offs + size - 1, 128 do
           if str then
              -- Loading the 128-byte block as 16 words; the first byte of every 8 goes to the lowest bits
              for j = 1, 16 do
                 pos = pos + 8
                 local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)
                 W[j] = XOR64(OR(SHL(h, 24), SHL(g, 16), SHL(f, 8), e) * int64(2^32), uint32(int32(OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))))
              end
           end
           v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
           -- v[0xC] is not assigned here: it is set below, combined with the byte counter
           v[0x8], v[0x9], v[0xA], v[0xB], v[0xD], v[0xE], v[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
           bytes_compressed = bytes_compressed + (last_block_size or 128)
           v[0xC] = XOR64(sha2_H_lo[5], bytes_compressed)  -- t0 = low_8_bytes(bytes_compressed)
           -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
           if last_block_size then  -- flag f0
              v[0xE] = NOT64(v[0xE])
           end
           if is_last_node then  -- flag f1
              v[0xF] = NOT64(v[0xF])
           end
           -- 12 rounds: 4 G() calls on the first group of index quadruples, then 4 on the second group
           for j = 1, 12 do
              local row = sigma[j]
              G(0, 4,  8, 12, row[ 1], row[ 2])
              G(1, 5,  9, 13, row[ 3], row[ 4])
              G(2, 6, 10, 14, row[ 5], row[ 6])
              G(3, 7, 11, 15, row[ 7], row[ 8])
              G(0, 5, 10, 15, row[ 9], row[10])
              G(1, 6, 11, 12, row[11], row[12])
              G(2, 7,  8, 13, row[13], row[14])
              G(3, 4,  9, 14, row[15], row[16])
           end
           -- Folding both halves of the work vector into the state
           h1 = XOR64(h1, v[0x0], v[0x8])
           h2 = XOR64(h2, v[0x1], v[0x9])
           h3 = XOR64(h3, v[0x2], v[0xA])
           h4 = XOR64(h4, v[0x3], v[0xB])
           h5 = XOR64(h5, v[0x4], v[0xC])
           h6 = XOR64(h6, v[0x5], v[0xD])
           h7 = XOR64(h7, v[0x6], v[0xE])
           h8 = XOR64(h8, v[0x7], v[0xF])
        end
        H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
        return bytes_compressed
     end
  end
  -- SHA-3 implementation for "LuaJIT 2.1 + FFI" branch
  local arr64_t = ffi.typeof"int64_t[?]"
  -- lanes array is indexed from 0
  lanes_index_base = 0
  hi_factor_keccak = int64(2^32)

  -- Replaces the table-based version: returns a new "int64_t" array of 30 elements
  function create_array_of_lanes()
     return arr64_t(30)  -- 25 + 5 for temporary usage
  end
  -- Absorbs "size" bytes of "str" (starting after "offs" bytes) into the array "lanes", in place.
  --    lanes:               array indexed from 0; elements 0..24 hold the state, elements 25..29 are used as temporaries
  --    block_size_in_bytes: number of bytes XORed into the state before every run of 24 rounds
  -- The second parameter is not used.
  -- Returns nothing.
  function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
     -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
     local RC = sha3_RC_lo
     local qwords_qty = SHR(block_size_in_bytes, 3)
     for pos = offs, offs + size - 1, block_size_in_bytes do
        -- XORing the block into the first lanes; the first byte of every 8 goes to the lowest bits
        for j = 0, qwords_qty - 1 do
           pos = pos + 8
           local h, g, f, e, d, c, b, a = byte(str, pos - 7, pos)   -- slow, but doesn't depend on endianness
           lanes[j] = XOR64(lanes[j], OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))))
        end
        for round_idx = 1, 24 do
           -- lanes[25..29] = XOR of the five lanes j, j+5, j+10, j+15, j+20
           for j = 0, 4 do
              lanes[25 + j] = XOR64(lanes[j], lanes[j+5], lanes[j+10], lanes[j+15], lanes[j+20])
           end
           -- Every lane is XORed with a value D, rotated and stored under a different index.
           -- The multiple assignments matter: all right-hand sides are evaluated before any lane is overwritten.
           local D = XOR64(lanes[25], ROL64(lanes[27], 1))
           lanes[1], lanes[6], lanes[11], lanes[16] = ROL64(XOR64(D, lanes[6]), 44), ROL64(XOR64(D, lanes[16]), 45), ROL64(XOR64(D, lanes[1]), 1), ROL64(XOR64(D, lanes[11]), 10)
           lanes[21] = ROL64(XOR64(D, lanes[21]), 2)
           D = XOR64(lanes[26], ROL64(lanes[28], 1))
           lanes[2], lanes[7], lanes[12], lanes[22] = ROL64(XOR64(D, lanes[12]), 43), ROL64(XOR64(D, lanes[22]), 61), ROL64(XOR64(D, lanes[7]), 6), ROL64(XOR64(D, lanes[2]), 62)
           lanes[17] = ROL64(XOR64(D, lanes[17]), 15)
           D = XOR64(lanes[27], ROL64(lanes[29], 1))
           lanes[3], lanes[8], lanes[18], lanes[23] = ROL64(XOR64(D, lanes[18]), 21), ROL64(XOR64(D, lanes[3]), 28), ROL64(XOR64(D, lanes[23]), 56), ROL64(XOR64(D, lanes[8]), 55)
           lanes[13] = ROL64(XOR64(D, lanes[13]), 25)
           D = XOR64(lanes[28], ROL64(lanes[25], 1))
           lanes[4], lanes[14], lanes[19], lanes[24] = ROL64(XOR64(D, lanes[24]), 14), ROL64(XOR64(D, lanes[19]), 8), ROL64(XOR64(D, lanes[4]), 27), ROL64(XOR64(D, lanes[14]), 39)
           lanes[9] = ROL64(XOR64(D, lanes[9]), 20)
           D = XOR64(lanes[29], ROL64(lanes[26], 1))
           lanes[5], lanes[10], lanes[15], lanes[20] = ROL64(XOR64(D, lanes[10]), 3), ROL64(XOR64(D, lanes[20]), 18), ROL64(XOR64(D, lanes[5]), 36), ROL64(XOR64(D, lanes[15]), 41)
           lanes[0] = XOR64(D, lanes[0])
           -- Non-linear step on every group of five lanes; the round constant RC[round_idx] is mixed into lanes[0] only
           lanes[0], lanes[1], lanes[2], lanes[3], lanes[4] = XOR64(lanes[0], AND64(NOT64(lanes[1]), lanes[2]), RC[round_idx]), XOR64(lanes[1], AND64(NOT64(lanes[2]), lanes[3])), XOR64(lanes[2], AND64(NOT64(lanes[3]), lanes[4])), XOR64(lanes[3], AND64(NOT64(lanes[4]), lanes[0])), XOR64(lanes[4], AND64(NOT64(lanes[0]), lanes[1]))
           lanes[5], lanes[6], lanes[7], lanes[8], lanes[9] = XOR64(lanes[8], AND64(NOT64(lanes[9]), lanes[5])), XOR64(lanes[9], AND64(NOT64(lanes[5]), lanes[6])), XOR64(lanes[5], AND64(NOT64(lanes[6]), lanes[7])), XOR64(lanes[6], AND64(NOT64(lanes[7]), lanes[8])), XOR64(lanes[7], AND64(NOT64(lanes[8]), lanes[9]))
           lanes[10], lanes[11], lanes[12], lanes[13], lanes[14] = XOR64(lanes[11], AND64(NOT64(lanes[12]), lanes[13])), XOR64(lanes[12], AND64(NOT64(lanes[13]), lanes[14])), XOR64(lanes[13], AND64(NOT64(lanes[14]), lanes[10])), XOR64(lanes[14], AND64(NOT64(lanes[10]), lanes[11])), XOR64(lanes[10], AND64(NOT64(lanes[11]), lanes[12]))
           lanes[15], lanes[16], lanes[17], lanes[18], lanes[19] = XOR64(lanes[19], AND64(NOT64(lanes[15]), lanes[16])), XOR64(lanes[15], AND64(NOT64(lanes[16]), lanes[17])), XOR64(lanes[16], AND64(NOT64(lanes[17]), lanes[18])), XOR64(lanes[17], AND64(NOT64(lanes[18]), lanes[19])), XOR64(lanes[18], AND64(NOT64(lanes[19]), lanes[15]))
           lanes[20], lanes[21], lanes[22], lanes[23], lanes[24] = XOR64(lanes[22], AND64(NOT64(lanes[23]), lanes[24])), XOR64(lanes[23], AND64(NOT64(lanes[24]), lanes[20])), XOR64(lanes[24], AND64(NOT64(lanes[20]), lanes[21])), XOR64(lanes[20], AND64(NOT64(lanes[21]), lanes[22])), XOR64(lanes[21], AND64(NOT64(lanes[22]), lanes[23]))
        end
     end
  end
  local A5_long = 0xA5A5A5A5 * int64(2^32 + 1)  -- It's impossible to use constant 0xA5A5A5A5A5A5A5A5LL because it will raise syntax error on other Lua versions

  -- 64-bit replacement of XORA5: returns long XOR long2.
  -- When "long2" is omitted, "long" is XORed with A5_long.
  function XORA5(long, long2)
     return XOR64(long, long2 or A5_long)
  end
  -- SHA512 implementation for "LuaJIT 2.1 + FFI" branch
  --    H:    array of 8 state words H[1]..H[8] (64-bit values), updated in place
  --    str:  string containing the data
  --    offs: number of bytes of "str" to skip before the first block
  --    size: number of bytes to process
  -- The second parameter is not used.
  -- Returns nothing.
  function sha512_feed_128(H, _, str, offs, size)
     -- offs >= 0, size >= 0, size is multiple of 128
     local W, K = common_W_FFI_int64, sha2_K_lo
     for pos = offs, offs + size - 1, 128 do
        -- Loading the 128-byte block as 16 words; the first byte of every 8 goes to the highest bits
        for j = 0, 15 do
           pos = pos + 8
           local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)   -- slow, but doesn't depend on endianness
           W[j] = OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h))))
        end
        -- Extending the 16 words to 80 words
        for j = 16, 79 do
           local a, b = W[j-15], W[j-2]
           W[j] = XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) + XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) + W[j-7] + W[j-16]
        end
        local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        -- 80 rounds, 8 rounds per loop iteration
        for j = 0, 79, 8 do
           local z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+1] + W[j]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+2] + W[j+1]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+3] + W[j+2]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+4] + W[j+3]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+5] + W[j+4]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+6] + W[j+5]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+7] + W[j+6]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
           z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+8] + W[j+7]
           h, g, f, e = g, f, e, z + d
           d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
        end
        -- Adding the result of this block to the state
        H[1] = a + H[1]
        H[2] = b + H[2]
        H[3] = c + H[3]
        H[4] = d + H[4]
        H[5] = e + H[5]
        H[6] = f + H[6]
        H[7] = g + H[7]
        H[8] = h + H[8]
     end
  end
  558. else -- LuaJIT 2.0 doesn't support 64-bit bitwise operations
  -- The order of the fields "lo" and "hi" inside the struct follows the byte order reported by ffi.abi("le")
  local U = ffi.new("union{int64_t i64; struct{int32_t "..(ffi.abi("le") and "lo, hi" or "hi, lo")..";} i32;}[3]")
  -- this array of unions is used for fast splitting int64 into int32_high and int32_low

  -- "xorrific" 64-bit functions :-)
  -- int64 input is split into two int32 parts, some bitwise 32-bit operations are performed, finally the result is converted to int64
  -- these functions are needed because bit.* functions in LuaJIT 2.0 don't work with int64_t
  -- Returns XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) for an int64 value `a`.
  -- The value is split into 32-bit halves through the scratch union U[0],
  -- each half of the result is built with 32-bit operations, then both are glued back.
  local function XORROR64_1(a)
     U[0].i64 = a
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local out_lo = XOR(
        SHR(lo, 1), SHL(hi, 31),   -- low half of ROR64(a, 1)
        SHR(lo, 8), SHL(hi, 24),   -- low half of ROR64(a, 8)
        SHR(lo, 7), SHL(hi, 25)    -- low half of SHR64(a, 7)
     )
     local out_hi = XOR(
        SHR(hi, 1), SHL(lo, 31),   -- high half of ROR64(a, 1)
        SHR(hi, 8), SHL(lo, 24),   -- high half of ROR64(a, 8)
        SHR(hi, 7)                 -- high half of SHR64(a, 7): nothing comes in from above
     )
     -- high half scaled by 2^32 plus the low half taken as an unsigned 32-bit value
     return out_hi * int64(2^32) + uint32(int32(out_lo))
  end
  -- Returns XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) for an int64 value `b`,
  -- computed on the two 32-bit halves obtained through the scratch union U[0].
  local function XORROR64_2(b)
     U[0].i64 = b
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local out_lo = XOR(
        SHR(lo, 19), SHL(hi, 13),  -- low half of ROR64(b, 19)
        SHL(lo, 3), SHR(hi, 29),   -- low half of ROL64(b, 3)
        SHR(lo, 6), SHL(hi, 26)    -- low half of SHR64(b, 6)
     )
     local out_hi = XOR(
        SHR(hi, 19), SHL(lo, 13),  -- high half of ROR64(b, 19)
        SHL(hi, 3), SHR(lo, 29),   -- high half of ROL64(b, 3)
        SHR(hi, 6)                 -- high half of SHR64(b, 6)
     )
     return out_hi * int64(2^32) + uint32(int32(out_lo))
  end
  -- Returns XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) for an int64 value `e`,
  -- computed on the two 32-bit halves obtained through the scratch union U[0].
  local function XORROR64_3(e)
     U[0].i64 = e
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local out_lo = XOR(
        SHR(lo, 14), SHL(hi, 18),  -- low half of ROR64(e, 14)
        SHR(lo, 18), SHL(hi, 14),  -- low half of ROR64(e, 18)
        SHL(lo, 23), SHR(hi, 9)    -- low half of ROL64(e, 23)
     )
     local out_hi = XOR(
        SHR(hi, 14), SHL(lo, 18),  -- high half of ROR64(e, 14)
        SHR(hi, 18), SHL(lo, 14),  -- high half of ROR64(e, 18)
        SHL(hi, 23), SHR(lo, 9)    -- high half of ROL64(e, 23)
     )
     return out_hi * int64(2^32) + uint32(int32(out_lo))
  end
  -- Returns XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) for an int64 value `a`,
  -- computed on the two 32-bit halves obtained through the scratch union U[0].
  local function XORROR64_6(a)
     U[0].i64 = a
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local out_lo = XOR(
        SHR(lo, 28), SHL(hi, 4),   -- low half of ROR64(a, 28)
        SHL(lo, 30), SHR(hi, 2),   -- low half of ROL64(a, 30)
        SHL(lo, 25), SHR(hi, 7)    -- low half of ROL64(a, 25)
     )
     local out_hi = XOR(
        SHR(hi, 28), SHL(lo, 4),   -- high half of ROR64(a, 28)
        SHL(hi, 30), SHR(lo, 2),   -- high half of ROL64(a, 30)
        SHL(hi, 25), SHR(lo, 7)    -- high half of ROL64(a, 25)
     )
     return out_hi * int64(2^32) + uint32(int32(out_lo))
  end
  -- Returns XOR64(g, AND64(e, XOR64(f, g))) for int64 values e, f, g.
  -- Each operand is parked in its own scratch union (f -> U[0], g -> U[1], e -> U[2])
  -- so that its 32-bit halves can be read back and combined independently.
  local function XORROR64_4(e, f, g)
     U[0].i64 = f
     U[1].i64 = g
     U[2].i64 = e
     local fl, fh = U[0].i32.lo, U[0].i32.hi
     local gl, gh = U[1].i32.lo, U[1].i32.hi
     local el, eh = U[2].i32.lo, U[2].i32.hi
     local lo = XOR(gl, AND(el, XOR(fl, gl)))
     local hi = XOR(gh, AND(eh, XOR(fh, gh)))
     return hi * int64(2^32) + uint32(int32(lo))
  end
  -- Returns XOR64(AND64(XOR64(a, b), c), AND64(a, b)) for int64 values a, b, c.
  -- The operands go through the scratch unions U[0], U[1], U[2] to expose their 32-bit halves.
  local function XORROR64_5(a, b, c)
     U[0].i64 = a
     U[1].i64 = b
     U[2].i64 = c
     local al, ah = U[0].i32.lo, U[0].i32.hi
     local bl, bh = U[1].i32.lo, U[1].i32.hi
     local cl, ch = U[2].i32.lo, U[2].i32.hi
     local lo = XOR(AND(XOR(al, bl), cl), AND(al, bl))
     local hi = XOR(AND(XOR(ah, bh), ch), AND(ah, bh))
     return hi * int64(2^32) + uint32(int32(lo))
  end
  -- Returns ROR64(XOR64(a, b), m) for int64 values a, b and a rotation count m = 1..31.
  -- The complementary shift is written as SHL(x, -m); this relies on the bit library
  -- taking shift counts modulo 32, so -m acts as 32 - m.
  local function XORROR64_7(a, b, m)
     U[0].i64 = a
     U[1].i64 = b
     local al, ah = U[0].i32.lo, U[0].i32.hi
     local bl, bh = U[1].i32.lo, U[1].i32.hi
     local x_lo = XOR(al, bl)
     local x_hi = XOR(ah, bh)
     local out_lo = XOR(SHR(x_lo, m), SHL(x_hi, -m))
     local out_hi = XOR(SHR(x_hi, m), SHL(x_lo, -m))
     return out_hi * int64(2^32) + uint32(int32(out_lo))
  end
  -- Returns ROL64(XOR64(a, b), 1) for int64 values a and b.
  -- The top bit of each 32-bit half is carried into the bottom bit of the other half.
  local function XORROR64_8(a, b)
     U[0].i64 = a
     U[1].i64 = b
     local al, ah = U[0].i32.lo, U[0].i32.hi
     local bl, bh = U[1].i32.lo, U[1].i32.hi
     local x_lo = XOR(al, bl)
     local x_hi = XOR(ah, bh)
     local out_lo = XOR(SHL(x_lo, 1), SHR(x_hi, 31))
     local out_hi = XOR(SHL(x_hi, 1), SHR(x_lo, 31))
     return out_hi * int64(2^32) + uint32(int32(out_lo))
  end
  -- Returns ROR64(XOR64(a, b), 32) for int64 values a and b.
  -- Rotating by 32 just swaps the halves: the XOR of the low halves becomes the
  -- high half of the result and the XOR of the high halves becomes the low half.
  local function XORROR64_9(a, b)
     U[0].i64 = a
     U[1].i64 = b
     local al, ah = U[0].i32.lo, U[0].i32.hi
     local bl, bh = U[1].i32.lo, U[1].i32.hi
     local swapped_hi = XOR(al, bl)
     local swapped_lo = XOR(ah, bh)
     return swapped_hi * int64(2^32) + uint32(int32(swapped_lo))
  end
  -- Returns the 64-bit XOR of a and b.
  -- Both operands are stored into scratch unions (U[0], U[1]), XORed half by half
  -- with the 32-bit XOR, and the two halves are recombined into an int64.
  local function XOR64(a, b)
     U[0].i64 = a
     U[1].i64 = b
     local al, ah = U[0].i32.lo, U[0].i32.hi
     local bl, bh = U[1].i32.lo, U[1].i32.hi
     local lo = XOR(al, bl)
     local hi = XOR(ah, bh)
     return hi * int64(2^32) + uint32(int32(lo))
  end
  -- Returns the 64-bit XOR of three values: XOR64(a, b, c).
  -- Uses all three scratch unions so that every operand's halves are available at once.
  local function XORROR64_11(a, b, c)
     U[0].i64 = a
     U[1].i64 = b
     U[2].i64 = c
     local al, ah = U[0].i32.lo, U[0].i32.hi
     local bl, bh = U[1].i32.lo, U[1].i32.hi
     local cl, ch = U[2].i32.lo, U[2].i32.hi
     local lo = XOR(al, bl, cl)
     local hi = XOR(ah, bh, ch)
     return hi * int64(2^32) + uint32(int32(lo))
  end
  -- Returns XOR64(long, long2 or 0xA5A5A5A5A5A5A5A5).
  -- When long2 is absent, both 32-bit halves of `long` are XORed with 0xA5A5A5A5.
  function XORA5(long, long2)
     U[0].i64 = long
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local mask_lo, mask_hi
     if long2 then
        U[1].i64 = long2
        mask_lo, mask_hi = U[1].i32.lo, U[1].i32.hi
     else
        mask_lo, mask_hi = 0xA5A5A5A5, 0xA5A5A5A5
     end
     lo = XOR(lo, mask_lo)
     hi = XOR(hi, mask_hi)
     return hi * int64(2^32) + uint32(int32(lo))
  end
  -- Formats an int64 as text: HEX of the high 32-bit half followed by HEX of the low half.
  function HEX64(long)
     U[0].i64 = long
     local hi_text = HEX(U[0].i32.hi)
     local lo_text = HEX(U[0].i32.lo)
     return hi_text..lo_text
  end
  -- SHA512 implementation for "LuaJIT 2.0 + FFI" branch
  -- Feeds `size` bytes of `str`, starting right after byte offset `offs`, into the eight
  -- chaining values H[1..8], 128 bytes per outer iteration.  The second parameter is not
  -- used in this branch.  Nothing is returned; H is updated in place.
  function sha512_feed_128(H, _, str, offs, size)
     -- offs >= 0, size >= 0, size is multiple of 128
     local W, K = common_W_FFI_int64, sha2_K_lo
     for pos = offs, offs + size - 1, 128 do
        -- W[0..15]: the block read as sixteen big-endian 64-bit words
        for j = 0, 15 do
           local last = pos + 8 * (j + 1)
           local b1, b2, b3, b4, b5, b6, b7, b8 = byte(str, last - 7, last) -- slow, but doesn't depend on endianness
           local word_hi = OR(SHL(b1, 24), SHL(b2, 16), SHL(b3, 8), b4)
           local word_lo = OR(SHL(b5, 24), SHL(b6, 16), SHL(b7, 8), b8)
           W[j] = word_hi * int64(2^32) + uint32(int32(word_lo))
        end
        -- W[16..79]: every further word is derived from four earlier ones
        for j = 16, 79 do
           local s0, s1 = XORROR64_1(W[j-15]), XORROR64_2(W[j-2])
           W[j] = s0 + s1 + W[j-7] + W[j-16]
        end
        local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        -- 80 rounds, eight per loop iteration (K is 1-based, W is 0-based)
        for j = 0, 79, 8 do
           local t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+1] + W[j]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+2] + W[j+1]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+3] + W[j+2]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+4] + W[j+3]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+5] + W[j+4]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+6] + W[j+5]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+7] + W[j+6]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1

           t1 = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+8] + W[j+7]
           h, g, f, e = g, f, e, t1 + d
           d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + t1
        end
        -- add the working variables back into the chaining values
        H[1], H[2], H[3], H[4] = a + H[1], b + H[2], c + H[3], d + H[4]
        H[5], H[6], H[7], H[8] = e + H[5], f + H[6], g + H[7], h + H[8]
     end
  end
  -- BLAKE2b implementation for "LuaJIT 2.0 + FFI" branch
  do
     local v = ffi.new("int64_t[?]", 16)   -- the 16-word working vector, indices 0..15
     local W = common_W_blake2b

     -- Mixing function: updates v[a], v[b], v[c], v[d] using message words W[k1] and W[k2].
     local function G(a, b, c, d, k1, k2)
        local xa, xb, xc, xd = v[a], v[b], v[c], v[d]
        xa = W[k1] + (xa + xb)
        xd = XORROR64_9(xd, xa)       -- rotate right by 32
        xc = xc + xd
        xb = XORROR64_7(xb, xc, 24)   -- rotate right by 24
        xa = W[k2] + (xa + xb)
        xd = XORROR64_7(xd, xa, 16)   -- rotate right by 16
        xc = xc + xd
        xb = XORROR64_8(xb, xc)       -- rotate left by 1
        v[a], v[b], v[c], v[d] = xa, xb, xc, xd
     end

     -- Compresses `size` bytes (128 per iteration) into H[1..8] and returns the updated
     -- byte counter.  When `str` is nil/false no message words are loaded and the current
     -- contents of W are compressed as they are.  `last_block_size`, when given, is the
     -- number of bytes added to the counter instead of 128 and sets flag f0;
     -- `is_last_node` sets flag f1.  The second parameter is not used in this branch.
     function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        -- offs >= 0, size >= 0, size is multiple of 128
        local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        for pos = offs, offs + size - 1, 128 do
           if str then
              -- W[1..16]: the block read as sixteen little-endian 64-bit words
              for j = 1, 16 do
                 local last = pos + 8 * j
                 local b1, b2, b3, b4, b5, b6, b7, b8 = byte(str, last - 7, last)
                 local word_hi = OR(SHL(b8, 24), SHL(b7, 16), SHL(b6, 8), b5)
                 local word_lo = OR(SHL(b4, 24), SHL(b3, 16), SHL(b2, 8), b1)
                 W[j] = XOR64(word_hi * int64(2^32), uint32(int32(word_lo)))
              end
           end
           -- upper half of v starts from the chaining values, lower half from sha2_H_lo
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7] = h1, h2, h3, h4, h5, h6, h7, h8
           v[8], v[9], v[10], v[11], v[13], v[14], v[15] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
           bytes_compressed = bytes_compressed + (last_block_size or 128)
           v[12] = XOR64(sha2_H_lo[5], bytes_compressed)  -- t0 = low_8_bytes(bytes_compressed)
           -- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
           if last_block_size then  -- flag f0
              v[14] = -1 - v[14]
           end
           if is_last_node then  -- flag f1
              v[15] = -1 - v[15]
           end
           -- 12 rounds; sigma[j] lists the message word indices for round j
           for j = 1, 12 do
              local perm = sigma[j]
              G(0, 4,  8, 12, perm[ 1], perm[ 2])
              G(1, 5,  9, 13, perm[ 3], perm[ 4])
              G(2, 6, 10, 14, perm[ 5], perm[ 6])
              G(3, 7, 11, 15, perm[ 7], perm[ 8])
              G(0, 5, 10, 15, perm[ 9], perm[10])
              G(1, 6, 11, 12, perm[11], perm[12])
              G(2, 7,  8, 13, perm[13], perm[14])
              G(3, 4,  9, 14, perm[15], perm[16])
           end
           -- fold both halves of v back into the chaining values
           h1 = XORROR64_11(h1, v[0], v[8])
           h2 = XORROR64_11(h2, v[1], v[9])
           h3 = XORROR64_11(h3, v[2], v[10])
           h4 = XORROR64_11(h4, v[3], v[11])
           h5 = XORROR64_11(h5, v[4], v[12])
           h6 = XORROR64_11(h6, v[5], v[13])
           h7 = XORROR64_11(h7, v[6], v[14])
           h8 = XORROR64_11(h8, v[7], v[15])
        end
        H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
        return bytes_compressed
     end
  end
  800. end
  -- MD5 implementation for "LuaJIT with FFI" branch
  -- Folds `size` bytes of `str` (the bytes following offset `offs`) into the four state
  -- words H[1..4], one 64-byte block per outer iteration.  H is updated in place.
  function md5_feed_64(H, str, offs, size)
     -- offs >= 0, size >= 0, size is multiple of 64
     local W, K = common_W_FFI_int32, md5_K
     for pos = offs, offs + size - 1, 64 do
        -- W[0..15]: the block read as sixteen little-endian 32-bit words
        for j = 0, 15 do
           local last = pos + 4 * (j + 1)
           local b0, b1, b2, b3 = byte(str, last - 3, last) -- slow, but doesn't depend on endianness
           W[j] = OR(SHL(b3, 24), SHL(b2, 16), SHL(b1, 8), b0)
        end
        local a, b, c, d = H[1], H[2], H[3], H[4]
        -- steps 0..15: message words are used in their natural order
        for j = 0, 15, 4 do
           local t = XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 7) + b)
           t = XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+1] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 12) + b)
           t = XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+2] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 17) + b)
           t = XOR(d, AND(b, XOR(c, d))) + (K[j+4] + W[j+3] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 22) + b)
        end
        -- steps 16..31: word index is taken modulo 16 via AND(..., 15)
        for j = 16, 31, 4 do
           local base = 5*j
           local t = XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(base + 1, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 5) + b)
           t = XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(base + 6, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 9) + b)
           t = XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(base - 5, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 14) + b)
           t = XOR(c, AND(d, XOR(b, c))) + (K[j+4] + W[AND(base, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 20) + b)
        end
        -- steps 32..47
        for j = 32, 47, 4 do
           local base = 3*j
           local t = XOR(b, c, d) + (K[j+1] + W[AND(base + 5, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 4) + b)
           t = XOR(b, c, d) + (K[j+2] + W[AND(base + 8, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 11) + b)
           t = XOR(b, c, d) + (K[j+3] + W[AND(base - 5, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 16) + b)
           t = XOR(b, c, d) + (K[j+4] + W[AND(base - 2, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 23) + b)
        end
        -- steps 48..63
        for j = 48, 63, 4 do
           local base = 7*j
           local t = XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(base, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 6) + b)
           t = XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(base + 7, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 10) + b)
           t = XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(base - 2, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 15) + b)
           t = XOR(c, OR(b, NOT(d))) + (K[j+4] + W[AND(base + 5, 15)] + a)
           a, d, c, b = d, c, b, NORM(ROL(t, 21) + b)
        end
        -- add the working variables back into the state
        H[1] = NORM(a + H[1])
        H[2] = NORM(b + H[2])
        H[3] = NORM(c + H[3])
        H[4] = NORM(d + H[4])
     end
  end
  -- SHA-1 implementation for "LuaJIT with FFI" branch
  -- Folds `size` bytes of `str` (the bytes following offset `offs`) into the five state
  -- words H[1..5], one 64-byte block per outer iteration.  H is updated in place.
  function sha1_feed_64(H, str, offs, size)
     -- offs >= 0, size >= 0, size is multiple of 64
     local W = common_W_FFI_int32
     for pos = offs, offs + size - 1, 64 do
        -- W[0..15]: the block read as sixteen big-endian 32-bit words
        for j = 0, 15 do
           local last = pos + 4 * (j + 1)
           local b1, b2, b3, b4 = byte(str, last - 3, last) -- slow, but doesn't depend on endianness
           W[j] = OR(SHL(b1, 24), SHL(b2, 16), SHL(b3, 8), b4)
        end
        -- W[16..79]: XOR of four earlier words, rotated left by one bit
        for j = 16, 79 do
           W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
        end
        local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
        -- rounds 0..19, constant = floor(2^30 * sqrt(2))
        for j = 0, 19, 5 do
           local t = ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + 0x5A827999 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
        end
        -- rounds 20..39, constant = 2^30 * sqrt(3)
        for j = 20, 39, 5 do
           local t = ROL(a, 5) + XOR(b, c, d) + (W[j] + 0x6ED9EBA1 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
        end
        -- rounds 40..59, constant = 2^30 * sqrt(5)
        for j = 40, 59, 5 do
           local t = ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + 0x8F1BBCDC + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
        end
        -- rounds 60..79, constant = 2^30 * sqrt(10)
        for j = 60, 79, 5 do
           local t = ROL(a, 5) + XOR(b, c, d) + (W[j] + 0xCA62C1D6 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
           t = ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e)
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(t)
        end
        -- add the working variables back into the state
        H[1] = NORM(a + H[1])
        H[2] = NORM(b + H[2])
        H[3] = NORM(c + H[3])
        H[4] = NORM(d + H[4])
        H[5] = NORM(e + H[5])
     end
  end
  887. end
  888. if branch == "FFI" and not is_LuaJIT_21 or branch == "LJ" then
  if branch == "FFI" then
     -- ctype of a variable-length int32_t array, resolved once and reused for every allocation
     local lanes_ctype = ffi.typeof("int32_t[?]")

     -- Allocates one array of lane halves for the SHA-3 state.
     function create_array_of_lanes()
        return lanes_ctype(31)  -- 25 + 5 + 1 (due to 1-based indexing)
     end
  end
  895. -- SHA-3 implementation for "LuaJIT 2.0 + FFI" and "LuaJIT without FFI" branches
  -- Absorbs `size` bytes of `str`, one block of `block_size_in_bytes` bytes at a time, into a state
  -- kept as two parallel arrays: lanes_lo[k] / lanes_hi[k] hold the low / high 32 bits of lane k (k = 1..25).
  -- Indices 26..30 of both arrays are scratch space for the XOR sums computed at the start of each round.
  -- Each block is XORed into the first block_size_in_bytes/8 lanes, then 24 rounds are applied in place.
  -- Nothing is returned; the result is the updated contents of lanes_lo and lanes_hi.
  896. function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
  897. -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
  898. local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
  899. local qwords_qty = SHR(block_size_in_bytes, 3)
  900. for pos = offs, offs + size - 1, block_size_in_bytes do
  -- XOR the block into the state: every 8 input bytes are read as two little-endian 32-bit words,
  -- the first goes into lanes_lo[j], the second into lanes_hi[j]; pos advances by 8 per lane
  901. for j = 1, qwords_qty do
  902. local a, b, c, d = byte(str, pos + 1, pos + 4)
  903. lanes_lo[j] = XOR(lanes_lo[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
  904. pos = pos + 8
  905. a, b, c, d = byte(str, pos - 3, pos)
  906. lanes_hi[j] = XOR(lanes_hi[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
  907. end
  -- 24 rounds; round round_idx uses the constants RC_lo[round_idx] / RC_hi[round_idx]
  908. for round_idx = 1, 24 do
  -- scratch lanes 26..30 := XOR of the five lanes j, j+5, j+10, j+15, j+20 (low halves, then high halves)
  909. for j = 1, 5 do
  910. lanes_lo[25 + j] = XOR(lanes_lo[j], lanes_lo[j + 5], lanes_lo[j + 10], lanes_lo[j + 15], lanes_lo[j + 20])
  911. end
  912. for j = 1, 5 do
  913. lanes_hi[25 + j] = XOR(lanes_hi[j], lanes_hi[j + 5], lanes_hi[j + 10], lanes_hi[j + 15], lanes_hi[j + 20])
  914. end
  -- D = scratch lane 26 XOR (scratch lane 28 rotated left by 1 as a 64-bit value), built from 32-bit halves
  915. local D_lo = XOR(lanes_lo[26], SHL(lanes_lo[28], 1), SHR(lanes_hi[28], 31))
  916. local D_hi = XOR(lanes_hi[26], SHL(lanes_hi[28], 1), SHR(lanes_lo[28], 31))
  -- For lanes 2, 7, 12, 17: XOR D into the lane, rotate the 64-bit result and store it under another index
  -- of the same set.  All right-hand sides are evaluated before any assignment, so every source lane is
  -- read before it is overwritten.  The same pattern repeats below for the other four groups of lanes.
  917. lanes_lo[2], lanes_hi[2], lanes_lo[7], lanes_hi[7], lanes_lo[12], lanes_hi[12], lanes_lo[17], lanes_hi[17] = XOR(SHR(XOR(D_lo, lanes_lo[7]), 20), SHL(XOR(D_hi, lanes_hi[7]), 12)), XOR(SHR(XOR(D_hi, lanes_hi[7]), 20), SHL(XOR(D_lo, lanes_lo[7]), 12)), XOR(SHR(XOR(D_lo, lanes_lo[17]), 19), SHL(XOR(D_hi, lanes_hi[17]), 13)), XOR(SHR(XOR(D_hi, lanes_hi[17]), 19), SHL(XOR(D_lo, lanes_lo[17]), 13)), XOR(SHL(XOR(D_lo, lanes_lo[2]), 1), SHR(XOR(D_hi, lanes_hi[2]), 31)), XOR(SHL(XOR(D_hi, lanes_hi[2]), 1), SHR(XOR(D_lo, lanes_lo[2]), 31)), XOR(SHL(XOR(D_lo, lanes_lo[12]), 10), SHR(XOR(D_hi, lanes_hi[12]), 22)), XOR(SHL(XOR(D_hi, lanes_hi[12]), 10), SHR(XOR(D_lo, lanes_lo[12]), 22))
  -- lane 22 is rotated in place (left by 2) after D is XORed in
  918. local L, H = XOR(D_lo, lanes_lo[22]), XOR(D_hi, lanes_hi[22])
  919. lanes_lo[22], lanes_hi[22] = XOR(SHL(L, 2), SHR(H, 30)), XOR(SHL(H, 2), SHR(L, 30))
  -- D = scratch lane 27 XOR ROL64(scratch lane 29, 1); applied to lanes 3, 8, 13, 23 and then 18
  920. D_lo = XOR(lanes_lo[27], SHL(lanes_lo[29], 1), SHR(lanes_hi[29], 31))
  921. D_hi = XOR(lanes_hi[27], SHL(lanes_hi[29], 1), SHR(lanes_lo[29], 31))
  922. lanes_lo[3], lanes_hi[3], lanes_lo[8], lanes_hi[8], lanes_lo[13], lanes_hi[13], lanes_lo[23], lanes_hi[23] = XOR(SHR(XOR(D_lo, lanes_lo[13]), 21), SHL(XOR(D_hi, lanes_hi[13]), 11)), XOR(SHR(XOR(D_hi, lanes_hi[13]), 21), SHL(XOR(D_lo, lanes_lo[13]), 11)), XOR(SHR(XOR(D_lo, lanes_lo[23]), 3), SHL(XOR(D_hi, lanes_hi[23]), 29)), XOR(SHR(XOR(D_hi, lanes_hi[23]), 3), SHL(XOR(D_lo, lanes_lo[23]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[8]), 6), SHR(XOR(D_hi, lanes_hi[8]), 26)), XOR(SHL(XOR(D_hi, lanes_hi[8]), 6), SHR(XOR(D_lo, lanes_lo[8]), 26)), XOR(SHR(XOR(D_lo, lanes_lo[3]), 2), SHL(XOR(D_hi, lanes_hi[3]), 30)), XOR(SHR(XOR(D_hi, lanes_hi[3]), 2), SHL(XOR(D_lo, lanes_lo[3]), 30))
  923. L, H = XOR(D_lo, lanes_lo[18]), XOR(D_hi, lanes_hi[18])
  924. lanes_lo[18], lanes_hi[18] = XOR(SHL(L, 15), SHR(H, 17)), XOR(SHL(H, 15), SHR(L, 17))
  -- D = scratch lane 28 XOR ROL64(scratch lane 30, 1); applied to lanes 4, 9, 19, 24 and then 14
  925. D_lo = XOR(lanes_lo[28], SHL(lanes_lo[30], 1), SHR(lanes_hi[30], 31))
  926. D_hi = XOR(lanes_hi[28], SHL(lanes_hi[30], 1), SHR(lanes_lo[30], 31))
  927. lanes_lo[4], lanes_hi[4], lanes_lo[9], lanes_hi[9], lanes_lo[19], lanes_hi[19], lanes_lo[24], lanes_hi[24] = XOR(SHL(XOR(D_lo, lanes_lo[19]), 21), SHR(XOR(D_hi, lanes_hi[19]), 11)), XOR(SHL(XOR(D_hi, lanes_hi[19]), 21), SHR(XOR(D_lo, lanes_lo[19]), 11)), XOR(SHL(XOR(D_lo, lanes_lo[4]), 28), SHR(XOR(D_hi, lanes_hi[4]), 4)), XOR(SHL(XOR(D_hi, lanes_hi[4]), 28), SHR(XOR(D_lo, lanes_lo[4]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[24]), 8), SHL(XOR(D_hi, lanes_hi[24]), 24)), XOR(SHR(XOR(D_hi, lanes_hi[24]), 8), SHL(XOR(D_lo, lanes_lo[24]), 24)), XOR(SHR(XOR(D_lo, lanes_lo[9]), 9), SHL(XOR(D_hi, lanes_hi[9]), 23)), XOR(SHR(XOR(D_hi, lanes_hi[9]), 9), SHL(XOR(D_lo, lanes_lo[9]), 23))
  928. L, H = XOR(D_lo, lanes_lo[14]), XOR(D_hi, lanes_hi[14])
  929. lanes_lo[14], lanes_hi[14] = XOR(SHL(L, 25), SHR(H, 7)), XOR(SHL(H, 25), SHR(L, 7))
  -- D = scratch lane 29 XOR ROL64(scratch lane 26, 1); applied to lanes 5, 15, 20, 25 and then 10
  930. D_lo = XOR(lanes_lo[29], SHL(lanes_lo[26], 1), SHR(lanes_hi[26], 31))
  931. D_hi = XOR(lanes_hi[29], SHL(lanes_hi[26], 1), SHR(lanes_lo[26], 31))
  932. lanes_lo[5], lanes_hi[5], lanes_lo[15], lanes_hi[15], lanes_lo[20], lanes_hi[20], lanes_lo[25], lanes_hi[25] = XOR(SHL(XOR(D_lo, lanes_lo[25]), 14), SHR(XOR(D_hi, lanes_hi[25]), 18)), XOR(SHL(XOR(D_hi, lanes_hi[25]), 14), SHR(XOR(D_lo, lanes_lo[25]), 18)), XOR(SHL(XOR(D_lo, lanes_lo[20]), 8), SHR(XOR(D_hi, lanes_hi[20]), 24)), XOR(SHL(XOR(D_hi, lanes_hi[20]), 8), SHR(XOR(D_lo, lanes_lo[20]), 24)), XOR(SHL(XOR(D_lo, lanes_lo[5]), 27), SHR(XOR(D_hi, lanes_hi[5]), 5)), XOR(SHL(XOR(D_hi, lanes_hi[5]), 27), SHR(XOR(D_lo, lanes_lo[5]), 5)), XOR(SHR(XOR(D_lo, lanes_lo[15]), 25), SHL(XOR(D_hi, lanes_hi[15]), 7)), XOR(SHR(XOR(D_hi, lanes_hi[15]), 25), SHL(XOR(D_lo, lanes_lo[15]), 7))
  933. L, H = XOR(D_lo, lanes_lo[10]), XOR(D_hi, lanes_hi[10])
  934. lanes_lo[10], lanes_hi[10] = XOR(SHL(L, 20), SHR(H, 12)), XOR(SHL(H, 20), SHR(L, 12))
  -- D = scratch lane 30 XOR ROL64(scratch lane 27, 1); applied to lanes 6, 11, 16, 21 and then 1
  935. D_lo = XOR(lanes_lo[30], SHL(lanes_lo[27], 1), SHR(lanes_hi[27], 31))
  936. D_hi = XOR(lanes_hi[30], SHL(lanes_hi[27], 1), SHR(lanes_lo[27], 31))
  937. lanes_lo[6], lanes_hi[6], lanes_lo[11], lanes_hi[11], lanes_lo[16], lanes_hi[16], lanes_lo[21], lanes_hi[21] = XOR(SHL(XOR(D_lo, lanes_lo[11]), 3), SHR(XOR(D_hi, lanes_hi[11]), 29)), XOR(SHL(XOR(D_hi, lanes_hi[11]), 3), SHR(XOR(D_lo, lanes_lo[11]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[21]), 18), SHR(XOR(D_hi, lanes_hi[21]), 14)), XOR(SHL(XOR(D_hi, lanes_hi[21]), 18), SHR(XOR(D_lo, lanes_lo[21]), 14)), XOR(SHR(XOR(D_lo, lanes_lo[6]), 28), SHL(XOR(D_hi, lanes_hi[6]), 4)), XOR(SHR(XOR(D_hi, lanes_hi[6]), 28), SHL(XOR(D_lo, lanes_lo[6]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[16]), 23), SHL(XOR(D_hi, lanes_hi[16]), 9)), XOR(SHR(XOR(D_hi, lanes_hi[16]), 23), SHL(XOR(D_lo, lanes_lo[16]), 9))
  -- lane 1 only gets D XORed in; it is neither rotated nor moved
  938. lanes_lo[1], lanes_hi[1] = XOR(D_lo, lanes_lo[1]), XOR(D_hi, lanes_hi[1])
  -- Last step of the round, low halves first and high halves after: within each group of five lanes
  -- every new value has the form XOR(x, AND(NOT(y), z)) over three lanes of that group, and the round
  -- constant is additionally XORed into lane 1.  Each statement reads its whole group before writing it.
  939. lanes_lo[1], lanes_lo[2], lanes_lo[3], lanes_lo[4], lanes_lo[5] = XOR(lanes_lo[1], AND(NOT(lanes_lo[2]), lanes_lo[3]), RC_lo[round_idx]), XOR(lanes_lo[2], AND(NOT(lanes_lo[3]), lanes_lo[4])), XOR(lanes_lo[3], AND(NOT(lanes_lo[4]), lanes_lo[5])), XOR(lanes_lo[4], AND(NOT(lanes_lo[5]), lanes_lo[1])), XOR(lanes_lo[5], AND(NOT(lanes_lo[1]), lanes_lo[2]))
  940. lanes_lo[6], lanes_lo[7], lanes_lo[8], lanes_lo[9], lanes_lo[10] = XOR(lanes_lo[9], AND(NOT(lanes_lo[10]), lanes_lo[6])), XOR(lanes_lo[10], AND(NOT(lanes_lo[6]), lanes_lo[7])), XOR(lanes_lo[6], AND(NOT(lanes_lo[7]), lanes_lo[8])), XOR(lanes_lo[7], AND(NOT(lanes_lo[8]), lanes_lo[9])), XOR(lanes_lo[8], AND(NOT(lanes_lo[9]), lanes_lo[10]))
  941. lanes_lo[11], lanes_lo[12], lanes_lo[13], lanes_lo[14], lanes_lo[15] = XOR(lanes_lo[12], AND(NOT(lanes_lo[13]), lanes_lo[14])), XOR(lanes_lo[13], AND(NOT(lanes_lo[14]), lanes_lo[15])), XOR(lanes_lo[14], AND(NOT(lanes_lo[15]), lanes_lo[11])), XOR(lanes_lo[15], AND(NOT(lanes_lo[11]), lanes_lo[12])), XOR(lanes_lo[11], AND(NOT(lanes_lo[12]), lanes_lo[13]))
  942. lanes_lo[16], lanes_lo[17], lanes_lo[18], lanes_lo[19], lanes_lo[20] = XOR(lanes_lo[20], AND(NOT(lanes_lo[16]), lanes_lo[17])), XOR(lanes_lo[16], AND(NOT(lanes_lo[17]), lanes_lo[18])), XOR(lanes_lo[17], AND(NOT(lanes_lo[18]), lanes_lo[19])), XOR(lanes_lo[18], AND(NOT(lanes_lo[19]), lanes_lo[20])), XOR(lanes_lo[19], AND(NOT(lanes_lo[20]), lanes_lo[16]))
  943. lanes_lo[21], lanes_lo[22], lanes_lo[23], lanes_lo[24], lanes_lo[25] = XOR(lanes_lo[23], AND(NOT(lanes_lo[24]), lanes_lo[25])), XOR(lanes_lo[24], AND(NOT(lanes_lo[25]), lanes_lo[21])), XOR(lanes_lo[25], AND(NOT(lanes_lo[21]), lanes_lo[22])), XOR(lanes_lo[21], AND(NOT(lanes_lo[22]), lanes_lo[23])), XOR(lanes_lo[22], AND(NOT(lanes_lo[23]), lanes_lo[24]))
  944. lanes_hi[1], lanes_hi[2], lanes_hi[3], lanes_hi[4], lanes_hi[5] = XOR(lanes_hi[1], AND(NOT(lanes_hi[2]), lanes_hi[3]), RC_hi[round_idx]), XOR(lanes_hi[2], AND(NOT(lanes_hi[3]), lanes_hi[4])), XOR(lanes_hi[3], AND(NOT(lanes_hi[4]), lanes_hi[5])), XOR(lanes_hi[4], AND(NOT(lanes_hi[5]), lanes_hi[1])), XOR(lanes_hi[5], AND(NOT(lanes_hi[1]), lanes_hi[2]))
  945. lanes_hi[6], lanes_hi[7], lanes_hi[8], lanes_hi[9], lanes_hi[10] = XOR(lanes_hi[9], AND(NOT(lanes_hi[10]), lanes_hi[6])), XOR(lanes_hi[10], AND(NOT(lanes_hi[6]), lanes_hi[7])), XOR(lanes_hi[6], AND(NOT(lanes_hi[7]), lanes_hi[8])), XOR(lanes_hi[7], AND(NOT(lanes_hi[8]), lanes_hi[9])), XOR(lanes_hi[8], AND(NOT(lanes_hi[9]), lanes_hi[10]))
  946. lanes_hi[11], lanes_hi[12], lanes_hi[13], lanes_hi[14], lanes_hi[15] = XOR(lanes_hi[12], AND(NOT(lanes_hi[13]), lanes_hi[14])), XOR(lanes_hi[13], AND(NOT(lanes_hi[14]), lanes_hi[15])), XOR(lanes_hi[14], AND(NOT(lanes_hi[15]), lanes_hi[11])), XOR(lanes_hi[15], AND(NOT(lanes_hi[11]), lanes_hi[12])), XOR(lanes_hi[11], AND(NOT(lanes_hi[12]), lanes_hi[13]))
  947. lanes_hi[16], lanes_hi[17], lanes_hi[18], lanes_hi[19], lanes_hi[20] = XOR(lanes_hi[20], AND(NOT(lanes_hi[16]), lanes_hi[17])), XOR(lanes_hi[16], AND(NOT(lanes_hi[17]), lanes_hi[18])), XOR(lanes_hi[17], AND(NOT(lanes_hi[18]), lanes_hi[19])), XOR(lanes_hi[18], AND(NOT(lanes_hi[19]), lanes_hi[20])), XOR(lanes_hi[19], AND(NOT(lanes_hi[20]), lanes_hi[16]))
  948. lanes_hi[21], lanes_hi[22], lanes_hi[23], lanes_hi[24], lanes_hi[25] = XOR(lanes_hi[23], AND(NOT(lanes_hi[24]), lanes_hi[25])), XOR(lanes_hi[24], AND(NOT(lanes_hi[25]), lanes_hi[21])), XOR(lanes_hi[25], AND(NOT(lanes_hi[21]), lanes_hi[22])), XOR(lanes_hi[21], AND(NOT(lanes_hi[22]), lanes_hi[23])), XOR(lanes_hi[22], AND(NOT(lanes_hi[23]), lanes_hi[24]))
  949. end
  950. end
  951. end
  952. end
  953. if branch == "LJ" then
  954. -- SHA256 implementation for "LuaJIT without FFI" branch
  -- Compresses `size` bytes of `str`, starting right after byte offset `offs`, into the eight
  -- state words H[1..8], one 64-byte block per outer iteration.  Every stored word goes through NORM.
  -- Nothing is returned; H is updated in place.
  955. function sha256_feed_64(H, str, offs, size)
  956. -- offs >= 0, size >= 0, size is multiple of 64
  957. local W, K = common_W, sha2_K_hi
  958. for pos = offs, offs + size - 1, 64 do
  -- W[1..16]: the block read as sixteen big-endian 32-bit words (pos advances by 4 per word)
  959. for j = 1, 16 do
  960. pos = pos + 4
  961. local a, b, c, d = byte(str, pos - 3, pos)
  962. W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  963. end
  -- W[17..64]: message schedule expansion from W[j-15], W[j-2], W[j-7] and W[j-16];
  -- a 32-bit ROL by n is used where it equals the ROR by 32-n
  964. for j = 17, 64 do
  965. local a, b = W[j-15], W[j-2]
  966. W[j] = NORM( NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) ) + NORM( W[j-7] + W[j-16] ) )
  967. end
  968. local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  -- 64 rounds, eight per loop iteration; each round computes z, then shifts h..e and d..a by one position
  969. for j = 1, 64, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
  970. local z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j] + W[j] + h) )
  971. h, g, f, e = g, f, e, NORM(d + z)
  972. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  973. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+1] + W[j+1] + h) )
  974. h, g, f, e = g, f, e, NORM(d + z)
  975. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  976. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+2] + W[j+2] + h) )
  977. h, g, f, e = g, f, e, NORM(d + z)
  978. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  979. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+3] + W[j+3] + h) )
  980. h, g, f, e = g, f, e, NORM(d + z)
  981. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  982. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+4] + W[j+4] + h) )
  983. h, g, f, e = g, f, e, NORM(d + z)
  984. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  985. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+5] + W[j+5] + h) )
  986. h, g, f, e = g, f, e, NORM(d + z)
  987. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  988. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+6] + W[j+6] + h) )
  989. h, g, f, e = g, f, e, NORM(d + z)
  990. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  991. z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+7] + W[j+7] + h) )
  992. h, g, f, e = g, f, e, NORM(d + z)
  993. d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  994. end
  -- add the working variables back into the state
  995. H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
  996. H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
  997. end
  998. end
  -- Adds four 64-bit values given as (lo, hi) pairs of 32-bit halves.
  -- The low halves are reduced with % 2^32 before being summed, the carry out of that sum
  -- is added to the high halves, and both resulting halves are passed through NORM.
  -- Returns result_lo, result_hi.
  local function ADD64_4(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi)
     local low_total = a_lo % 2^32 + b_lo % 2^32 + c_lo % 2^32 + d_lo % 2^32
     local high_total = a_hi + b_hi + c_hi + d_hi
     local carry = floor(low_total / 2^32)
     return NORM( low_total ), NORM( high_total + carry )
  end
  1006. if LuaJIT_arch == "x86" then -- Special trick is required to avoid "PHI shuffling too complex" on x86 platform
  1007. -- SHA512 implementation for "LuaJIT x86 without FFI" branch
  -- Compresses `size` bytes of `str`, starting right after byte offset `offs`, into the eight 64-bit
  -- state words, which are kept as separate 32-bit halves in H_lo[1..8] and H_hi[1..8].
  -- One 128-byte block is processed per outer iteration.  Nothing is returned.
  1008. function sha512_feed_128(H_lo, H_hi, str, offs, size)
  1009. -- offs >= 0, size >= 0, size is multiple of 128
  1010. -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  1011. local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
  1012. for pos = offs, offs + size - 1, 128 do
  -- W[1..32]: the block read as thirty-two big-endian 32-bit words (high half of each 64-bit word first)
  1013. for j = 1, 16*2 do
  1014. pos = pos + 4
  1015. local a, b, c, d = byte(str, pos - 3, pos)
  1016. W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  1017. end
  -- message schedule expansion: jj is the index of the low half of the 64-bit word being produced
  1018. for jj = 17*2, 80*2, 2 do
  1019. local a_lo, a_hi = W[jj-30], W[jj-31]
  -- t = ROR64(a, 1) XOR ROR64(a, 8) XOR SHR64(a, 7), on 32-bit halves
  1020. local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
  1021. local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
  1022. local b_lo, b_hi = W[jj-4], W[jj-5]
  -- u = ROR64(b, 19) XOR ROL64(b, 3) XOR SHR64(b, 6), on 32-bit halves
  1023. local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
  1024. local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
  1025. W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
  1026. end
  1027. local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  1028. local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  -- `zero` starts at 0 and stays 0 (0 + 0); it exists only for the OR(zero, x) copies below
  1029. local zero = 0
  1030. for j = 1, 80 do
  -- t = g XOR (e AND (f XOR g)),  u = ROR64(e, 14) XOR ROR64(e, 18) XOR ROL64(e, 23)
  1031. local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
  1032. local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
  1033. local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
  1034. local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
  -- z = u + t + h + K[j] + W[j] as a 64-bit sum: low halves are added first, the carry goes into the high halves
  1035. local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
  1036. local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
  1037. zero = zero + zero -- this trick is needed to avoid "PHI shuffling too complex" due to PHIs overlap
  -- h, g, f := g, f, e (copied through OR(zero, x))
  1038. h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = OR(zero, g_lo), OR(zero, g_hi), OR(zero, f_lo), OR(zero, f_hi), OR(zero, e_lo), OR(zero, e_hi)
  -- e := z + d
  1039. local sum_lo = z_lo % 2^32 + d_lo % 2^32
  1040. e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
  -- d, c, b := c, b, a (copied through OR(zero, x)); from here on b holds the previous a
  1041. d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = OR(zero, c_lo), OR(zero, c_hi), OR(zero, b_lo), OR(zero, b_hi), OR(zero, a_lo), OR(zero, a_hi)
  -- u = ROR64(b, 28) XOR ROL64(b, 30) XOR ROL64(b, 25),  t = (d AND c) OR (b AND (d XOR c))
  1042. u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
  1043. u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
  1044. t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
  1045. t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
  -- a := z + t + u
  1046. local sum_lo = z_lo % 2^32 + t_lo % 2^32 + u_lo % 2^32
  1047. a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + t_hi + u_hi + floor(sum_lo / 2^32) )
  1048. end
  -- add the working variables back into the state (the two unused operand pairs of ADD64_4 are passed as 0)
  1049. H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
  1050. H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
  1051. H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
  1052. H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
  1053. H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
  1054. H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
  1055. H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
  1056. H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
  1057. end
  1058. end
  1059. else -- all platforms except x86
  1060. -- SHA512 implementation for "LuaJIT non-x86 without FFI" branch
  -- SHA-512 block function for the "LuaJIT non-x86 without FFI" branch.
  -- Absorbs whole 128-byte blocks of str and updates the state tables in place.
  --   H_lo, H_hi : arrays [1..8] with the low / high 32-bit halves of the eight state words
  --   str        : message string
  --   offs, size : offs >= 0, size >= 0, size is multiple of 128 (bytes offs+1 .. offs+size are read)
  -- Schedule layout in common_W: 64-bit word k is kept as Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  function sha512_feed_128(H_lo, H_hi, str, offs, size)
     local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
     for pos = offs, offs + size - 1, 128 do
        -- words 1..16: the block is read as 32 big-endian 32-bit integers
        local cursor = pos
        for idx = 1, 32 do
           cursor = cursor + 4
           local b1, b2, b3, b4 = byte(str, cursor - 3, cursor)
           W[idx] = OR(SHL(b1, 24), SHL(b2, 16), SHL(b3, 8), b4)
        end
        -- words 17..80 (idx = 2*k): built from words k-15, k-2, k-7 and k-16
        for idx = 34, 160, 2 do
           -- sigma0 of word k-15: ROTR1 xor ROTR8 xor SHR7
           local w15_lo, w15_hi = W[idx-30], W[idx-31]
           local sigma0_lo = XOR(OR(SHR(w15_lo, 1), SHL(w15_hi, 31)), OR(SHR(w15_lo, 8), SHL(w15_hi, 24)), OR(SHR(w15_lo, 7), SHL(w15_hi, 25)))
           local sigma0_hi = XOR(OR(SHR(w15_hi, 1), SHL(w15_lo, 31)), OR(SHR(w15_hi, 8), SHL(w15_lo, 24)), SHR(w15_hi, 7))
           -- sigma1 of word k-2: ROTR19 xor ROTR61 xor SHR6
           local w2_lo, w2_hi = W[idx-4], W[idx-5]
           local sigma1_lo = XOR(OR(SHR(w2_lo, 19), SHL(w2_hi, 13)), OR(SHL(w2_lo, 3), SHR(w2_hi, 29)), OR(SHR(w2_lo, 6), SHL(w2_hi, 26)))
           local sigma1_hi = XOR(OR(SHR(w2_hi, 19), SHL(w2_lo, 13)), OR(SHL(w2_hi, 3), SHR(w2_lo, 29)), SHR(w2_hi, 6))
           W[idx], W[idx-1] = ADD64_4(sigma0_lo, sigma0_hi, sigma1_lo, sigma1_hi, W[idx-14], W[idx-15], W[idx-32], W[idx-33])
        end
        local a_lo, b_lo, c_lo, d_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4]
        local e_lo, f_lo, g_lo, h_lo = H_lo[5], H_lo[6], H_lo[7], H_lo[8]
        local a_hi, b_hi, c_hi, d_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4]
        local e_hi, f_hi, g_hi, h_hi = H_hi[5], H_hi[6], H_hi[7], H_hi[8]
        for j = 1, 80 do
           -- Ch(e, f, g) and Sigma1(e) = ROTR14 xor ROTR18 xor ROTR41
           local ch_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
           local ch_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
           local sig1_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
           local sig1_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
           -- T1 = Sigma1 + Ch + h + K[j] + W[j]
           -- low halves are added as unsigned values so that the carry into the high half can be taken with floor()
           local low_sum = sig1_lo % 2^32 + ch_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
           local t1_lo = NORM(low_sum)
           local t1_hi = NORM(sig1_hi + ch_hi + h_hi + K_hi[j] + W[2*j-1] + floor(low_sum / 2^32))
           h_lo, h_hi = g_lo, g_hi
           g_lo, g_hi = f_lo, f_hi
           f_lo, f_hi = e_lo, e_hi
           -- e = T1 + d
           low_sum = t1_lo % 2^32 + d_lo % 2^32
           e_lo = NORM(low_sum)
           e_hi = NORM(t1_hi + d_hi + floor(low_sum / 2^32))
           d_lo, d_hi = c_lo, c_hi
           c_lo, c_hi = b_lo, b_hi
           b_lo, b_hi = a_lo, a_hi
           -- registers are already shifted here: b, c, d hold the previous a, b, c
           -- Sigma0 = ROTR28 xor ROTR34 xor ROTR39 of the previous a; Maj of the previous a, b, c
           local sig0_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
           local sig0_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
           local maj_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
           local maj_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
           -- a = T1 + Sigma0 + Maj
           low_sum = t1_lo % 2^32 + sig0_lo % 2^32 + maj_lo % 2^32
           a_lo = NORM(low_sum)
           a_hi = NORM(t1_hi + sig0_hi + maj_hi + floor(low_sum / 2^32))
        end
        -- add the working registers to the state (two unused operand slots of ADD64_4 are passed as zero)
        H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
        H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
        H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
        H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
        H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
        H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
        H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
        H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
     end
  end
  1110. end
  1111. -- MD5 implementation for "LuaJIT without FFI" branch
  -- MD5 block function for the "LuaJIT without FFI" branch.
  -- Absorbs whole 64-byte blocks of str and updates H[1..4] in place.
  --   H          : array [1..4] with the MD5 state words
  --   str        : message string
  --   offs, size : offs >= 0, size >= 0, size is multiple of 64 (bytes offs+1 .. offs+size are read)
  function md5_feed_64(H, str, offs, size)
     local W, K = common_W, md5_K
     for pos = offs, offs + size - 1, 64 do
        -- the block is read as 16 little-endian 32-bit words
        local cursor = pos
        for j = 1, 16 do
           cursor = cursor + 4
           local b1, b2, b3, b4 = byte(str, cursor - 3, cursor)
           W[j] = OR(SHL(b4, 24), SHL(b3, 16), SHL(b2, 8), b1)
        end
        local a, b, c, d = H[1], H[2], H[3], H[4]
        local f  -- value of the round's boolean function for the current step
        -- round 1: message words are used in their natural order
        for j = 1, 16, 4 do
           f = XOR(d, AND(b, XOR(c, d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j] + W[j] + a), 7) + b)
           f = XOR(d, AND(b, XOR(c, d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+1] + W[j+1] + a), 12) + b)
           f = XOR(d, AND(b, XOR(c, d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+2] + W[j+2] + a), 17) + b)
           f = XOR(d, AND(b, XOR(c, d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+3] + W[j+3] + a), 22) + b)
        end
        -- round 2: the word index advances by 5 (mod 16) on every step
        for j = 17, 32, 4 do
           local base = 5*j-4
           f = XOR(c, AND(d, XOR(b, c)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j] + W[AND(base, 15) + 1] + a), 5) + b)
           f = XOR(c, AND(d, XOR(b, c)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+1] + W[AND(base + 5, 15) + 1] + a), 9) + b)
           f = XOR(c, AND(d, XOR(b, c)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+2] + W[AND(base + 10, 15) + 1] + a), 14) + b)
           f = XOR(c, AND(d, XOR(b, c)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+3] + W[AND(base - 1, 15) + 1] + a), 20) + b)
        end
        -- round 3: the word index advances by 3 (mod 16) on every step
        for j = 33, 48, 4 do
           local base = 3*j+2
           f = XOR(b, c, d)
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j] + W[AND(base, 15) + 1] + a), 4) + b)
           f = XOR(b, c, d)
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+1] + W[AND(base + 3, 15) + 1] + a), 11) + b)
           f = XOR(b, c, d)
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+2] + W[AND(base + 6, 15) + 1] + a), 16) + b)
           f = XOR(b, c, d)
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+3] + W[AND(base - 7, 15) + 1] + a), 23) + b)
        end
        -- round 4: the word index advances by 7 (mod 16) on every step
        for j = 49, 64, 4 do
           local base = j*7
           f = XOR(c, OR(b, NOT(d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j] + W[AND(base - 7, 15) + 1] + a), 6) + b)
           f = XOR(c, OR(b, NOT(d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+1] + W[AND(base, 15) + 1] + a), 10) + b)
           f = XOR(c, OR(b, NOT(d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+2] + W[AND(base + 7, 15) + 1] + a), 15) + b)
           f = XOR(c, OR(b, NOT(d)))
           a, d, c, b = d, c, b, NORM(ROL(f + (K[j+3] + W[AND(base - 2, 15) + 1] + a), 21) + b)
        end
        -- add the working registers to the state
        H[1] = NORM(a + H[1])
        H[2] = NORM(b + H[2])
        H[3] = NORM(c + H[3])
        H[4] = NORM(d + H[4])
     end
  end
  1152. -- SHA-1 implementation for "LuaJIT without FFI" branch
  -- SHA-1 block function for the "LuaJIT without FFI" branch.
  -- Absorbs whole 64-byte blocks of str and updates H[1..5] in place.
  --   H          : array [1..5] with the SHA-1 state words
  --   str        : message string
  --   offs, size : offs >= 0, size >= 0, size is multiple of 64 (bytes offs+1 .. offs+size are read)
  function sha1_feed_64(H, str, offs, size)
     local W = common_W
     -- additive constants of the four 20-step stages:
     -- floor(2^30 * sqrt(2)), 2^30 * sqrt(3), 2^30 * sqrt(5), 2^30 * sqrt(10)
     local k1, k2, k3, k4 = 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
     for pos = offs, offs + size - 1, 64 do
        -- the block is read as 16 big-endian 32-bit words
        local cursor = pos
        for j = 1, 16 do
           cursor = cursor + 4
           local b1, b2, b3, b4 = byte(str, cursor - 3, cursor)
           W[j] = OR(SHL(b1, 24), SHL(b2, 16), SHL(b3, 8), b4)
        end
        -- schedule expansion to 80 words
        for j = 17, 80 do
           W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
        end
        local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
        -- steps 1..20: choice function
        for j = 1, 20, 5 do
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + k1 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + k1 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + k1 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + k1 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + k1 + e))
        end
        -- steps 21..40: parity function
        for j = 21, 40, 5 do
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + k2 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + k2 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + k2 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + k2 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + k2 + e))
        end
        -- steps 41..60: majority function
        for j = 41, 60, 5 do
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + k3 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + k3 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + k3 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + k3 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + k3 + e))
        end
        -- steps 61..80: parity function again
        for j = 61, 80, 5 do
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + k4 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + k4 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + k4 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + k4 + e))
           e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + k4 + e))
        end
        -- add the working registers to the state
        H[1] = NORM(a + H[1])
        H[2] = NORM(b + H[2])
        H[3] = NORM(c + H[3])
        H[4] = NORM(d + H[4])
        H[5] = NORM(e + H[5])
     end
  end
  1197. -- BLAKE2b implementation for "LuaJIT without FFI" branch
  1198. do
  1199. local v_lo, v_hi = {}, {}
  -- BLAKE2b mixing step on 64-bit words that are kept as pairs of 32-bit halves.
  --   a, b, c, d : indices into the work vector v_lo / v_hi
  --   k1, k2     : 1-based message word numbers; word k is read from
  --                common_W[2*k-1] (low half) and common_W[2*k] (high half)
  -- The four selected work-vector entries are updated in place.
  local function G(a, b, c, d, k1, k2)
     local W = common_W
     local wa_lo, wa_hi = v_lo[a], v_hi[a]
     local wb_lo, wb_hi = v_lo[b], v_hi[b]
     local wc_lo, wc_hi = v_lo[c], v_hi[c]
     local wd_lo, wd_hi = v_lo[d], v_hi[d]
     -- va = va + vb + m[k1]   (low halves are added unsigned, the carry goes to the high half)
     local sum = W[2*k1-1] + (wa_lo % 2^32 + wb_lo % 2^32)
     wa_lo = NORM(sum)
     wa_hi = NORM(W[2*k1] + (wa_hi + wb_hi + floor(sum / 2^32)))
     -- vd = (vd xor va) rotated by 32 bits: the halves simply change places
     wd_lo, wd_hi = XOR(wd_hi, wa_hi), XOR(wd_lo, wa_lo)
     -- vc = vc + vd
     sum = wc_lo % 2^32 + wd_lo % 2^32
     wc_lo = NORM(sum)
     wc_hi = NORM(wc_hi + wd_hi + floor(sum / 2^32))
     -- vb = (vb xor vc) rotated right by 24 bits
     wb_lo, wb_hi = XOR(wb_lo, wc_lo), XOR(wb_hi, wc_hi)
     wb_lo, wb_hi = XOR(SHR(wb_lo, 24), SHL(wb_hi, 8)), XOR(SHR(wb_hi, 24), SHL(wb_lo, 8))
     -- va = va + vb + m[k2]
     sum = W[2*k2-1] + (wa_lo % 2^32 + wb_lo % 2^32)
     wa_lo = NORM(sum)
     wa_hi = NORM(W[2*k2] + (wa_hi + wb_hi + floor(sum / 2^32)))
     -- vd = (vd xor va) rotated right by 16 bits
     wd_lo, wd_hi = XOR(wd_lo, wa_lo), XOR(wd_hi, wa_hi)
     wd_lo, wd_hi = XOR(SHR(wd_lo, 16), SHL(wd_hi, 16)), XOR(SHR(wd_hi, 16), SHL(wd_lo, 16))
     -- vc = vc + vd
     sum = wc_lo % 2^32 + wd_lo % 2^32
     wc_lo = NORM(sum)
     wc_hi = NORM(wc_hi + wd_hi + floor(sum / 2^32))
     -- vb = (vb xor vc) rotated left by 1 bit (same as right by 63)
     wb_lo, wb_hi = XOR(wb_lo, wc_lo), XOR(wb_hi, wc_hi)
     wb_lo, wb_hi = XOR(SHL(wb_lo, 1), SHR(wb_hi, 31)), XOR(SHL(wb_hi, 1), SHR(wb_lo, 31))
     -- write the results back
     v_lo[a], v_hi[a] = wa_lo, wa_hi
     v_lo[b], v_hi[b] = wb_lo, wb_hi
     v_lo[c], v_hi[c] = wc_lo, wc_hi
     v_lo[d], v_hi[d] = wd_lo, wd_hi
  end
  -- BLAKE2b block function for the "LuaJIT without FFI" branch.
  -- Compresses whole 128-byte blocks and updates the state tables in place.
  --   H_lo, H_hi       : arrays [1..8] with the low / high 32-bit halves of the state words
  --   str              : message string; when it is nil/false the words already stored in common_W are compressed
  --   offs, size       : offs >= 0, size >= 0, size is multiple of 128
  --   bytes_compressed : number of message bytes compressed before this call
  --   last_block_size  : when given, it is added to the byte counter instead of 128 and the finalization flag f0 is set
  --   is_last_node     : when true, the flag f1 is set
  -- Returns the updated value of bytes_compressed.
  function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
     local W = common_W
     local h1_lo, h2_lo, h3_lo, h4_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4]
     local h5_lo, h6_lo, h7_lo, h8_lo = H_lo[5], H_lo[6], H_lo[7], H_lo[8]
     local h1_hi, h2_hi, h3_hi, h4_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4]
     local h5_hi, h6_hi, h7_hi, h8_hi = H_hi[5], H_hi[6], H_hi[7], H_hi[8]
     for pos = offs, offs + size - 1, 128 do
        if str then
           -- the block is read as 32 little-endian 32-bit values (non-negative numbers)
           local cursor = pos
           for j = 1, 32 do
              cursor = cursor + 4
              local b1, b2, b3, b4 = byte(str, cursor - 3, cursor)
              W[j] = b4 * 2^24 + OR(SHL(b3, 16), SHL(b2, 8), b1)
           end
        end
        -- work vector: entries 0..7 are the current state, entries 8..15 are taken from sha2_H_lo / sha2_H_hi
        v_lo[0x0], v_hi[0x0] = h1_lo, h1_hi
        v_lo[0x1], v_hi[0x1] = h2_lo, h2_hi
        v_lo[0x2], v_hi[0x2] = h3_lo, h3_hi
        v_lo[0x3], v_hi[0x3] = h4_lo, h4_hi
        v_lo[0x4], v_hi[0x4] = h5_lo, h5_hi
        v_lo[0x5], v_hi[0x5] = h6_lo, h6_hi
        v_lo[0x6], v_hi[0x6] = h7_lo, h7_hi
        v_lo[0x7], v_hi[0x7] = h8_lo, h8_hi
        for i = 1, 8 do
           v_lo[i + 7] = sha2_H_lo[i]
           v_hi[i + 7] = sha2_H_hi[i]
        end
        bytes_compressed = bytes_compressed + (last_block_size or 128)
        -- t0 = low_8_bytes(bytes_compressed)
        v_lo[0xC] = XOR(v_lo[0xC], bytes_compressed % 2^32)
        v_hi[0xC] = XOR(v_hi[0xC], floor(bytes_compressed / 2^32))
        -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
        if last_block_size then  -- flag f0
           v_lo[0xE] = NOT(v_lo[0xE])
           v_hi[0xE] = NOT(v_hi[0xE])
        end
        if is_last_node then  -- flag f1
           v_lo[0xF] = NOT(v_lo[0xF])
           v_hi[0xF] = NOT(v_hi[0xF])
        end
        -- 12 rounds: four column steps followed by four diagonal steps
        for round = 1, 12 do
           local perm = sigma[round]
           G(0, 4,  8, 12, perm[ 1], perm[ 2])
           G(1, 5,  9, 13, perm[ 3], perm[ 4])
           G(2, 6, 10, 14, perm[ 5], perm[ 6])
           G(3, 7, 11, 15, perm[ 7], perm[ 8])
           G(0, 5, 10, 15, perm[ 9], perm[10])
           G(1, 6, 11, 12, perm[11], perm[12])
           G(2, 7,  8, 13, perm[13], perm[14])
           G(3, 4,  9, 14, perm[15], perm[16])
        end
        -- fold both halves of the work vector into the state
        h1_lo, h1_hi = XOR(h1_lo, v_lo[0x0], v_lo[0x8]), XOR(h1_hi, v_hi[0x0], v_hi[0x8])
        h2_lo, h2_hi = XOR(h2_lo, v_lo[0x1], v_lo[0x9]), XOR(h2_hi, v_hi[0x1], v_hi[0x9])
        h3_lo, h3_hi = XOR(h3_lo, v_lo[0x2], v_lo[0xA]), XOR(h3_hi, v_hi[0x2], v_hi[0xA])
        h4_lo, h4_hi = XOR(h4_lo, v_lo[0x3], v_lo[0xB]), XOR(h4_hi, v_hi[0x3], v_hi[0xB])
        h5_lo, h5_hi = XOR(h5_lo, v_lo[0x4], v_lo[0xC]), XOR(h5_hi, v_hi[0x4], v_hi[0xC])
        h6_lo, h6_hi = XOR(h6_lo, v_lo[0x5], v_lo[0xD]), XOR(h6_hi, v_hi[0x5], v_hi[0xD])
        h7_lo, h7_hi = XOR(h7_lo, v_lo[0x6], v_lo[0xE]), XOR(h7_hi, v_hi[0x6], v_hi[0xE])
        h8_lo, h8_hi = XOR(h8_lo, v_lo[0x7], v_lo[0xF]), XOR(h8_hi, v_hi[0x7], v_hi[0xF])
     end
     -- the state is stored back as non-negative 32-bit values
     H_lo[1], H_hi[1] = h1_lo % 2^32, h1_hi % 2^32
     H_lo[2], H_hi[2] = h2_lo % 2^32, h2_hi % 2^32
     H_lo[3], H_hi[3] = h3_lo % 2^32, h3_hi % 2^32
     H_lo[4], H_hi[4] = h4_lo % 2^32, h4_hi % 2^32
     H_lo[5], H_hi[5] = h5_lo % 2^32, h5_hi % 2^32
     H_lo[6], H_hi[6] = h6_lo % 2^32, h6_hi % 2^32
     H_lo[7], H_hi[7] = h7_lo % 2^32, h7_hi % 2^32
     H_lo[8], H_hi[8] = h8_lo % 2^32, h8_hi % 2^32
     return bytes_compressed
  end
  1289. end
  1290. end
  1291. if branch == "FFI" or branch == "LJ" then
  1292. -- BLAKE2s and BLAKE3 implementations for "LuaJIT with FFI" and "LuaJIT without FFI" branches
  1293. do
  1294. local W = common_W_blake2s
  1295. local v = v_for_blake2s_feed_64
  -- 32-bit mixing step shared by the BLAKE2s and BLAKE3 block functions below.
  --   a, b, c, d : indices into the work vector v
  --   k1, k2     : indices into the message word buffer W
  -- The four selected entries of v are updated in place.
  local function G(a, b, c, d, k1, k2)
     local wa, wb, wc, wd = v[a], v[b], v[c], v[d]
     -- first half: add message word k1, rotate right by 16 and by 12
     wa = NORM(W[k1] + (wa + wb))
     wd = ROR(XOR(wd, wa), 16)
     wc = NORM(wc + wd)
     wb = ROR(XOR(wb, wc), 12)
     -- second half: add message word k2, rotate right by 8 and by 7
     wa = NORM(W[k2] + (wa + wb))
     wd = ROR(XOR(wd, wa), 8)
     wc = NORM(wc + wd)
     wb = ROR(XOR(wb, wc), 7)
     v[a] = wa
     v[b] = wb
     v[c] = wc
     v[d] = wd
  end
  -- BLAKE2s block function for the "LuaJIT with FFI" and "LuaJIT without FFI" branches.
  -- Compresses whole 64-byte blocks and updates H[1..8] in place.
  --   H                : array [1..8] with the state words
  --   str              : message string; when it is nil/false the words already stored in W are compressed
  --   offs, size       : offs >= 0, size >= 0, size is multiple of 64
  --   bytes_compressed : number of message bytes compressed before this call
  --   last_block_size  : when given, it is added to the byte counter instead of 64 and the finalization flag f0 is set
  --   is_last_node     : when true, the flag f1 is set
  -- Returns the updated value of bytes_compressed.
  function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
     local h1, h2, h3, h4 = NORM(H[1]), NORM(H[2]), NORM(H[3]), NORM(H[4])
     local h5, h6, h7, h8 = NORM(H[5]), NORM(H[6]), NORM(H[7]), NORM(H[8])
     for pos = offs, offs + size - 1, 64 do
        if str then
           -- the block is read as 16 little-endian 32-bit words
           local cursor = pos
           for j = 1, 16 do
              cursor = cursor + 4
              local b1, b2, b3, b4 = byte(str, cursor - 3, cursor)
              W[j] = OR(SHL(b4, 24), SHL(b3, 16), SHL(b2, 8), b1)
           end
        end
        -- work vector: entries 0..7 are the current state, entries 8..15 are derived from sha2_H_hi[1..8]
        v[0x0], v[0x1], v[0x2], v[0x3] = h1, h2, h3, h4
        v[0x4], v[0x5], v[0x6], v[0x7] = h5, h6, h7, h8
        v[0x8], v[0x9] = NORM(sha2_H_hi[1]), NORM(sha2_H_hi[2])
        v[0xA], v[0xB] = NORM(sha2_H_hi[3]), NORM(sha2_H_hi[4])
        v[0xE], v[0xF] = NORM(sha2_H_hi[7]), NORM(sha2_H_hi[8])
        bytes_compressed = bytes_compressed + (last_block_size or 64)
        v[0xC] = XOR(sha2_H_hi[5], bytes_compressed % 2^32)         -- t0 = low_4_bytes(bytes_compressed)
        v[0xD] = XOR(sha2_H_hi[6], floor(bytes_compressed / 2^32))  -- t1 = high_4_bytes(bytes_compressed)
        if last_block_size then  -- flag f0
           v[0xE] = NOT(v[0xE])
        end
        if is_last_node then  -- flag f1
           v[0xF] = NOT(v[0xF])
        end
        -- 10 rounds: four column steps followed by four diagonal steps
        for round = 1, 10 do
           local perm = sigma[round]
           G(0, 4,  8, 12, perm[ 1], perm[ 2])
           G(1, 5,  9, 13, perm[ 3], perm[ 4])
           G(2, 6, 10, 14, perm[ 5], perm[ 6])
           G(3, 7, 11, 15, perm[ 7], perm[ 8])
           G(0, 5, 10, 15, perm[ 9], perm[10])
           G(1, 6, 11, 12, perm[11], perm[12])
           G(2, 7,  8, 13, perm[13], perm[14])
           G(3, 4,  9, 14, perm[15], perm[16])
        end
        -- fold both halves of the work vector into the state
        h1 = XOR(h1, v[0x0], v[0x8])
        h2 = XOR(h2, v[0x1], v[0x9])
        h3 = XOR(h3, v[0x2], v[0xA])
        h4 = XOR(h4, v[0x3], v[0xB])
        h5 = XOR(h5, v[0x4], v[0xC])
        h6 = XOR(h6, v[0x5], v[0xD])
        h7 = XOR(h7, v[0x6], v[0xE])
        h8 = XOR(h8, v[0x7], v[0xF])
     end
     H[1], H[2], H[3], H[4] = h1, h2, h3, h4
     H[5], H[6], H[7], H[8] = h5, h6, h7, h8
     return bytes_compressed
  end
  -- BLAKE3 block function for the "LuaJIT with FFI" and "LuaJIT without FFI" branches.
  -- Compresses whole 64-byte blocks, chaining the result of each block into the next one.
  --   str          : message string; when it is nil/false the words already stored in W are compressed
  --   offs, size   : offs >= 0, size >= 0, size is multiple of 64
  --   flags        : value placed into work vector entry 15
  --   chunk_index  : counter; its low / high 32 bits go to work vector entries 12 / 13
  --   H_in         : array [1..8] with the input chaining value
  --   H_out        : array that receives the result in [1..8]; defaults to H_in
  --   wide_output  : when true, H_out[9..16] is additionally filled (input chaining value xor upper half of the work vector)
  --   block_length : value placed into work vector entry 14; defaults to 64
  function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
     block_length = block_length or 64
     local h1, h2, h3, h4 = NORM(H_in[1]), NORM(H_in[2]), NORM(H_in[3]), NORM(H_in[4])
     local h5, h6, h7, h8 = NORM(H_in[5]), NORM(H_in[6]), NORM(H_in[7]), NORM(H_in[8])
     H_out = H_out or H_in
     for pos = offs, offs + size - 1, 64 do
        if str then
           -- the block is read as 16 little-endian 32-bit words
           local cursor = pos
           for j = 1, 16 do
              cursor = cursor + 4
              local b1, b2, b3, b4 = byte(str, cursor - 3, cursor)
              W[j] = OR(SHL(b4, 24), SHL(b3, 16), SHL(b2, 8), b1)
           end
        end
        -- work vector: chaining value, four constants from sha2_H_hi, counter, block length and flags
        v[0x0], v[0x1], v[0x2], v[0x3] = h1, h2, h3, h4
        v[0x4], v[0x5], v[0x6], v[0x7] = h5, h6, h7, h8
        v[0x8], v[0x9] = NORM(sha2_H_hi[1]), NORM(sha2_H_hi[2])
        v[0xA], v[0xB] = NORM(sha2_H_hi[3]), NORM(sha2_H_hi[4])
        v[0xC] = NORM(chunk_index % 2^32)   -- t0 = low_4_bytes(chunk_index)
        v[0xD] = floor(chunk_index / 2^32)  -- t1 = high_4_bytes(chunk_index)
        v[0xE] = block_length
        v[0xF] = flags
        -- 7 rounds; the message word order of every round is looked up in perm_blake3 relative to the round number
        for round = 1, 7 do
           G(0, 4,  8, 12, perm_blake3[round],      perm_blake3[round + 14])
           G(1, 5,  9, 13, perm_blake3[round + 1],  perm_blake3[round + 2])
           G(2, 6, 10, 14, perm_blake3[round + 16], perm_blake3[round + 7])
           G(3, 7, 11, 15, perm_blake3[round + 15], perm_blake3[round + 17])
           G(0, 5, 10, 15, perm_blake3[round + 21], perm_blake3[round + 5])
           G(1, 6, 11, 12, perm_blake3[round + 3],  perm_blake3[round + 6])
           G(2, 7,  8, 13, perm_blake3[round + 4],  perm_blake3[round + 18])
           G(3, 4,  9, 14, perm_blake3[round + 19], perm_blake3[round + 20])
        end
        if wide_output then
           -- must run before h1..h8 are overwritten: it uses the chaining value this block started with
           H_out[ 9] = XOR(h1, v[0x8])
           H_out[10] = XOR(h2, v[0x9])
           H_out[11] = XOR(h3, v[0xA])
           H_out[12] = XOR(h4, v[0xB])
           H_out[13] = XOR(h5, v[0xC])
           H_out[14] = XOR(h6, v[0xD])
           H_out[15] = XOR(h7, v[0xE])
           H_out[16] = XOR(h8, v[0xF])
        end
        -- new chaining value: lower half of the work vector xor upper half
        h1, h2 = XOR(v[0x0], v[0x8]), XOR(v[0x1], v[0x9])
        h3, h4 = XOR(v[0x2], v[0xA]), XOR(v[0x3], v[0xB])
        h5, h6 = XOR(v[0x4], v[0xC]), XOR(v[0x5], v[0xD])
        h7, h8 = XOR(v[0x6], v[0xE]), XOR(v[0x7], v[0xF])
     end
     H_out[1], H_out[2], H_out[3], H_out[4] = h1, h2, h3, h4
     H_out[5], H_out[6], H_out[7], H_out[8] = h5, h6, h7, h8
  end
  1404. end
  1405. end
  1406. if branch == "INT64" then
  1407. -- implementation for Lua 5.3/5.4
  1408. hi_factor = 4294967296
  1409. hi_factor_keccak = 4294967296
  1410. lanes_index_base = 1
  1411. HEX64, XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64 = load[=[-- branch "INT64"
  1412. local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3 = ...
  1413. local string_format, string_unpack = string.format, string.unpack
  -- Formats the integer x as exactly 16 lowercase hexadecimal digits (zero-padded).
  local function HEX64(x)
     local hex_digits = string_format("%016x", x)
     return hex_digits
  end
  -- Returns x xor y; when y is omitted (nil or false) the 64-bit pattern 0xa5a5a5a5a5a5a5a5 is used instead.
  local function XORA5(x, y)
     if not y then
        y = 0xa5a5a5a5a5a5a5a5
     end
     return x ~ y
  end
  -- Bitwise exclusive OR of two integers, implemented with the native operator.
  local function XOR_BYTE(x, y)
     local mixed = x ~ y
     return mixed
  end
  -- SHA-256 block function for the "INT64" branch (native 64-bit integers).
  -- Absorbs whole 64-byte blocks of str and updates H[1..8] in place.
  --   offs, size : offs >= 0, size >= 0, size is multiple of 64 (bytes offs+1 .. offs+size are read)
  -- A 32-bit word is copied into both halves of a 64-bit integer, so that a plain right shift
  -- leaves the rotated word in the low 32 bits; bits above 32 are not kept clean between steps.
  local function sha256_feed_64(H, str, offs, size)
     local W, K = common_W, sha2_K_hi
     local mask32 = (1 << 32) - 1
     local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
     for pos = offs + 1, offs + size, 64 do
        -- words 1..16: sixteen big-endian 32-bit integers
        W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
           string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
        -- words 17..64, each reduced to 32 bits
        for j = 17, 64 do
           local x = W[j-15]
           x = (x << 32) | x
           local y = W[j-2]
           y = (y << 32) | y
           local sigma0 = (x >> 7) ~ (x >> 18) ~ (x >> 35)   -- ROTR7 xor ROTR18 xor SHR3
           local sigma1 = (y >> 17) ~ (y >> 19) ~ (y >> 42)  -- ROTR17 xor ROTR19 xor SHR10
           W[j] = (sigma0 + sigma1 + W[j-7] + W[j-16]) & mask32
        end
        local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
        for j = 1, 64 do
           e = (e << 32) | (e & mask32)
           -- t1 = Sigma1(e) + Ch(e, f, g) + h + K[j] + W[j]
           local t1 = ((e >> 6) ~ (e >> 11) ~ (e >> 25)) + (g ~ (e & (f ~ g))) + h + K[j] + W[j]
           h, g, f = g, f, e
           e = t1 + d
           d, c, b = c, b, a
           a = (a << 32) | (a & mask32)
           -- c and d already hold the previous b and c, so this is t1 + Maj + Sigma0 of the previous a
           a = t1 + (((a ~ c) & d) ~ (a & c)) + ((a >> 2) ~ (a >> 13) ~ (a >> 22))
        end
        h1 = a + h1
        h2 = b + h2
        h3 = c + h3
        h4 = d + h4
        h5 = e + h5
        h6 = f + h6
        h7 = g + h7
        h8 = h + h8
     end
     H[1], H[2], H[3], H[4] = h1, h2, h3, h4
     H[5], H[6], H[7], H[8] = h5, h6, h7, h8
  end
  -- SHA-512 block function for the "INT64" branch (native 64-bit integers).
  -- Absorbs whole 128-byte blocks of str and updates H[1..8] in place.
  -- The second parameter is not used by this branch.
  --   offs, size : offs >= 0, size >= 0, size is multiple of 128 (bytes offs+1 .. offs+size are read)
  local function sha512_feed_128(H, _, str, offs, size)
     local W, K = common_W, sha2_K_lo
     local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
     for pos = offs + 1, offs + size, 128 do
        -- words 1..16: sixteen big-endian 64-bit integers
        W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
           string_unpack(">i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos)
        -- words 17..80; every rotation is written as a pair of opposite shifts
        for j = 17, 80 do
           local x = W[j-15]
           local y = W[j-2]
           -- ROTR1 xor ROTR8 xor SHR7
           local sigma0 = ((x >> 1) ~ (x << 63)) ~ ((x >> 8) ~ (x << 56)) ~ (x >> 7)
           -- ROTR19 xor ROTR61 xor SHR6
           local sigma1 = ((y >> 19) ~ (y << 45)) ~ ((y >> 61) ~ (y << 3)) ~ (y >> 6)
           W[j] = sigma0 + sigma1 + W[j-7] + W[j-16]
        end
        local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
        for j = 1, 80 do
           -- ROTR14 xor ROTR18 xor ROTR41
           local big_sigma1 = ((e >> 14) ~ (e << 50)) ~ ((e >> 18) ~ (e << 46)) ~ ((e >> 41) ~ (e << 23))
           local t1 = big_sigma1 + (g ~ (e & (f ~ g))) + h + K[j] + W[j]
           h, g, f = g, f, e
           e = t1 + d
           d, c, b = c, b, a
           -- ROTR28 xor ROTR34 xor ROTR39
           local big_sigma0 = ((a >> 28) ~ (a << 36)) ~ ((a >> 34) ~ (a << 30)) ~ ((a >> 39) ~ (a << 25))
           -- c and d already hold the previous b and c, so the middle term is Maj of the previous a, b, c
           a = t1 + (((a ~ c) & d) ~ (a & c)) + big_sigma0
        end
        h1 = a + h1
        h2 = b + h2
        h3 = c + h3
        h4 = d + h4
        h5 = e + h5
        h6 = f + h6
        h7 = g + h7
        h8 = h + h8
     end
     H[1], H[2], H[3], H[4] = h1, h2, h3, h4
     H[5], H[6], H[7], H[8] = h5, h6, h7, h8
  end
  -- MD5 block function for the "INT64" branch (native 64-bit integers).
  -- Absorbs whole 64-byte blocks of str and updates H[1..4] in place.
  --   offs, size : offs >= 0, size >= 0, size is multiple of 64 (bytes offs+1 .. offs+size are read)
  -- A left rotation by r is done by copying the 32-bit value into both halves of a 64-bit integer
  -- and shifting right by s = 32 - r; the following value of s is looked up in md5_next_shift.
  local function md5_feed_64(H, str, offs, size)
     local W, K, next_shift = common_W, md5_K, md5_next_shift
     local mask32 = (1 << 32) - 1
     local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
     for pos = offs + 1, offs + size, 64 do
        -- sixteen little-endian 32-bit words
        W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
           string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
        local a, b, c, d = h1, h2, h3, h4
        -- round 1: message words in their natural order
        local s = 32-7
        for j = 1, 16 do
           local sum = (d ~ (b & (c ~ d))) + a + K[j] + W[j]
           a, d, c = d, c, b
           b = (((sum << 32) | (sum & mask32)) >> s) + b
           s = next_shift[s]
        end
        -- round 2: word index advances by 5 (mod 16)
        s = 32-5
        for j = 17, 32 do
           local sum = (c ~ (d & (b ~ c))) + a + K[j] + W[((5*j - 4) & 15) + 1]
           a, d, c = d, c, b
           b = (((sum << 32) | (sum & mask32)) >> s) + b
           s = next_shift[s]
        end
        -- round 3: word index advances by 3 (mod 16)
        s = 32-4
        for j = 33, 48 do
           local sum = (b ~ c ~ d) + a + K[j] + W[((3*j + 2) & 15) + 1]
           a, d, c = d, c, b
           b = (((sum << 32) | (sum & mask32)) >> s) + b
           s = next_shift[s]
        end
        -- round 4: word index advances by 7 (mod 16)
        s = 32-6
        for j = 49, 64 do
           local sum = (c ~ (b | ~d)) + a + K[j] + W[((j*7 - 7) & 15) + 1]
           a, d, c = d, c, b
           b = (((sum << 32) | (sum & mask32)) >> s) + b
           s = next_shift[s]
        end
        h1 = a + h1
        h2 = b + h2
        h3 = c + h3
        h4 = d + h4
     end
     H[1] = h1
     H[2] = h2
     H[3] = h3
     H[4] = h4
  end
  -- SHA-1 block function for the "INT64" branch (native 64-bit integers).
  -- Absorbs whole 64-byte blocks of str and updates H[1..5] in place.
  --   offs, size : offs >= 0, size >= 0, size is multiple of 64 (bytes offs+1 .. offs+size are read)
  -- Rotations are done by copying the 32-bit value into both halves of a 64-bit integer and shifting:
  -- ">> 27" is a left rotation by 5, ">> 2" is a right rotation by 2.
  local function sha1_feed_64(H, str, offs, size)
     local W = common_W
     local mask32 = (1 << 32) - 1
     local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
     for pos = offs + 1, offs + size, 64 do
        -- sixteen big-endian 32-bit words
        W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
           string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
        -- schedule expansion to 80 words: xor of four earlier words, rotated left by 1
        for j = 17, 80 do
           local x = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16]
           W[j] = (((x << 32) | x) << 1) >> 32
        end
        local a, b, c, d, e = h1, h2, h3, h4, h5
        -- steps 1..20: choice function
        for j = 1, 20 do
           local t = (((a << 32) | (a & mask32)) >> 27) + (d ~ (b & (c ~ d))) + 0x5A827999 + W[j] + e  -- constant = floor(2^30 * sqrt(2))
           e, d = d, c
           c = ((b << 32) | (b & mask32)) >> 2
           b, a = a, t
        end
        -- steps 21..40: parity function
        for j = 21, 40 do
           local t = (((a << 32) | (a & mask32)) >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e  -- 2^30 * sqrt(3)
           e, d = d, c
           c = ((b << 32) | (b & mask32)) >> 2
           b, a = a, t
        end
        -- steps 41..60: majority function
        for j = 41, 60 do
           local t = (((a << 32) | (a & mask32)) >> 27) + (((b ~ c) & d) ~ (b & c)) + 0x8F1BBCDC + W[j] + e  -- 2^30 * sqrt(5)
           e, d = d, c
           c = ((b << 32) | (b & mask32)) >> 2
           b, a = a, t
        end
        -- steps 61..80: parity function again
        for j = 61, 80 do
           local t = (((a << 32) | (a & mask32)) >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e  -- 2^30 * sqrt(10)
           e, d = d, c
           c = ((b << 32) | (b & mask32)) >> 2
           b, a = a, t
        end
        h1 = a + h1
        h2 = b + h2
        h3 = c + h3
        h4 = d + h4
        h5 = e + h5
     end
     H[1] = h1
     H[2] = h2
     H[3] = h3
     H[4] = h4
     H[5] = h5
  end
  -- table of string.unpack formats built for "i8" items, indexed by the number of 64-bit words in a block
  local keccak_format_i8 = build_keccak_format("i8")

  -- Keccak absorb step for the "INT64" branch (native 64-bit integers).
  -- XORs every message block into the state and applies the 24-round permutation to it.
  --   lanes               : array [1..25] of 64-bit integers (the state), updated in place
  --   _                   : not used by this branch
  --   str                 : message string
  --   offs, size          : offs >= 0, size >= 0, size is multiple of block_size_in_bytes
  --   block_size_in_bytes : positive multiple of 8
  local function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
     local RC = sha3_RC_lo
     local qwords_qty = block_size_in_bytes / 8
     local unpack_format = keccak_format_i8[qwords_qty]
     for pos = offs + 1, offs + size, block_size_in_bytes do
        -- absorb: xor the block into the leading lanes
        local block_words = {string_unpack(unpack_format, str, pos)}
        for j = 1, qwords_qty do
           lanes[j] = lanes[j] ~ block_words[j]
        end
        local L01, L02, L03, L04, L05, L06, L07, L08, L09, L10, L11, L12, L13, L14, L15, L16, L17, L18, L19, L20, L21, L22, L23, L24, L25 =
           lanes[1], lanes[2], lanes[3], lanes[4], lanes[5],
           lanes[6], lanes[7], lanes[8], lanes[9], lanes[10],
           lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
           lanes[16], lanes[17], lanes[18], lanes[19], lanes[20],
           lanes[21], lanes[22], lanes[23], lanes[24], lanes[25]
        for round_idx = 1, 24 do
           -- parity of every column
           local P1 = L01 ~ L06 ~ L11 ~ L16 ~ L21
           local P2 = L02 ~ L07 ~ L12 ~ L17 ~ L22
           local P3 = L03 ~ L08 ~ L13 ~ L18 ~ L23
           local P4 = L04 ~ L09 ~ L14 ~ L19 ~ L24
           local P5 = L05 ~ L10 ~ L15 ~ L20 ~ L25
           -- Each group below mixes one column with the parities of its neighbours, then rotates
           -- the five lanes and stores them into permuted slots of the same column;
           -- the row-wise step that follows the groups reads them from these slots.
           -- column 2
           local mixer = P1 ~ (P3 << 1) ~ (P3 >> 63)
           local x0, x1, x2, x3, x4 = mixer ~ L02, mixer ~ L07, mixer ~ L12, mixer ~ L17, mixer ~ L22
           L02 = (x1 << 44) ~ (x1 >> 20)
           L07 = (x3 << 45) ~ (x3 >> 19)
           L12 = (x0 << 1) ~ (x0 >> 63)
           L17 = (x2 << 10) ~ (x2 >> 54)
           L22 = (x4 << 2) ~ (x4 >> 62)
           -- column 3
           mixer = P2 ~ (P4 << 1) ~ (P4 >> 63)
           x0, x1, x2, x3, x4 = mixer ~ L03, mixer ~ L08, mixer ~ L13, mixer ~ L18, mixer ~ L23
           L03 = (x2 << 43) ~ (x2 >> 21)
           L08 = (x4 << 61) ~ (x4 >> 3)
           L13 = (x1 << 6) ~ (x1 >> 58)
           L18 = (x3 << 15) ~ (x3 >> 49)
           L23 = (x0 << 62) ~ (x0 >> 2)
           -- column 4
           mixer = P3 ~ (P5 << 1) ~ (P5 >> 63)
           x0, x1, x2, x3, x4 = mixer ~ L04, mixer ~ L09, mixer ~ L14, mixer ~ L19, mixer ~ L24
           L04 = (x3 << 21) ~ (x3 >> 43)
           L09 = (x0 << 28) ~ (x0 >> 36)
           L14 = (x2 << 25) ~ (x2 >> 39)
           L19 = (x4 << 56) ~ (x4 >> 8)
           L24 = (x1 << 55) ~ (x1 >> 9)
           -- column 5
           mixer = P4 ~ (P1 << 1) ~ (P1 >> 63)
           x0, x1, x2, x3, x4 = mixer ~ L05, mixer ~ L10, mixer ~ L15, mixer ~ L20, mixer ~ L25
           L05 = (x4 << 14) ~ (x4 >> 50)
           L10 = (x1 << 20) ~ (x1 >> 44)
           L15 = (x3 << 8) ~ (x3 >> 56)
           L20 = (x0 << 27) ~ (x0 >> 37)
           L25 = (x2 << 39) ~ (x2 >> 25)
           -- column 1 (the first lane is mixed but neither rotated nor moved)
           mixer = P5 ~ (P2 << 1) ~ (P2 >> 63)
           x1, x2, x3, x4 = mixer ~ L06, mixer ~ L11, mixer ~ L16, mixer ~ L21
           L06 = (x2 << 3) ~ (x2 >> 61)
           L11 = (x4 << 18) ~ (x4 >> 46)
           L16 = (x1 << 36) ~ (x1 >> 28)
           L21 = (x3 << 41) ~ (x3 >> 23)
           L01 = mixer ~ L01
           -- row-wise nonlinear step; every row must be a single multiple assignment,
           -- because all right-hand sides have to see the values from before the step
           L01, L02, L03, L04, L05 =
              L01 ~ (~L02 & L03), L02 ~ (~L03 & L04), L03 ~ (~L04 & L05), L04 ~ (~L05 & L01), L05 ~ (~L01 & L02)
           L06, L07, L08, L09, L10 =
              L09 ~ (~L10 & L06), L10 ~ (~L06 & L07), L06 ~ (~L07 & L08), L07 ~ (~L08 & L09), L08 ~ (~L09 & L10)
           L11, L12, L13, L14, L15 =
              L12 ~ (~L13 & L14), L13 ~ (~L14 & L15), L14 ~ (~L15 & L11), L15 ~ (~L11 & L12), L11 ~ (~L12 & L13)
           L16, L17, L18, L19, L20 =
              L20 ~ (~L16 & L17), L16 ~ (~L17 & L18), L17 ~ (~L18 & L19), L18 ~ (~L19 & L20), L19 ~ (~L20 & L16)
           L21, L22, L23, L24, L25 =
              L23 ~ (~L24 & L25), L24 ~ (~L25 & L21), L25 ~ (~L21 & L22), L21 ~ (~L22 & L23), L22 ~ (~L23 & L24)
           -- round constant
           L01 = L01 ~ RC[round_idx]
        end
        -- write the permuted state back
        lanes[1], lanes[2], lanes[3], lanes[4], lanes[5] = L01, L02, L03, L04, L05
        lanes[6], lanes[7], lanes[8], lanes[9], lanes[10] = L06, L07, L08, L09, L10
        lanes[11], lanes[12], lanes[13], lanes[14], lanes[15] = L11, L12, L13, L14, L15
        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20] = L16, L17, L18, L19, L20
        lanes[21], lanes[22], lanes[23], lanes[24], lanes[25] = L21, L22, L23, L24, L25
     end
  end
  1708. local function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        -- Compresses `size` bytes of `str` (starting right after byte `offs`) into the eight
        -- chaining words H[1..8], one 64-byte block per outer-loop iteration; H is updated in place.
        --   str              : message string; when nil/false nothing is unpacked and the 16 words
        --                      already present in common_W are used as the message block
        --   bytes_compressed : running byte counter; every block adds (last_block_size or 64) to it
        --   last_block_size  : if set, used instead of 64 for the counter and flag f0 is raised
        --   is_last_node     : if set, flag f1 is raised
        -- Returns the updated bytes_compressed.
        -- The masking with (1<<32)-1 shows that values live in integers wider than 32 bits:
        -- additions and left shifts are never truncated, so bits above bit 31 hold garbage and
        -- are masked away right before every right shift.
        -- NOTE(review): H is written back unmasked; presumably consumers only use the low 32 bits
        -- of each word -- confirm against the callers.
  1709. -- offs >= 0, size >= 0, size is multiple of 64
  1710. local W = common_W
  1711. local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  1712. for pos = offs + 1, offs + size, 64 do
  1713. if str then
        -- sixteen little-endian unsigned 32-bit words
  1714. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1715. string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
  1716. end
        -- working vector: v0..v7 start as the chaining value, v8..vF start from sha2_H_hi[1..8]
  1717. local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  1718. local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
  1719. bytes_compressed = bytes_compressed + (last_block_size or 64)
  1720. vC = vC ~ bytes_compressed -- t0 = low_4_bytes(bytes_compressed)
  1721. vD = vD ~ bytes_compressed >> 32 -- t1 = high_4_bytes(bytes_compressed)
  1722. if last_block_size then -- flag f0
  1723. vE = ~vE
  1724. end
  1725. if is_last_node then -- flag f1
  1726. vF = ~vF
  1727. end
        -- ten rounds; sigma[j] is the row of message-word indices used by round j
  1728. for j = 1, 10 do
  1729. local row = sigma[j]
        -- Each group of 12 statements mixes one quadruple of v-words with two message words.
        -- "(x & (1<<32)-1) >> n | x << 32-n" is a 32-bit rotate right by n (n = 16, 12, 8, 7).
        -- First four groups work on the columns of the 4x4 state:
        -- (v0,v4,v8,vC), (v1,v5,v9,vD), (v2,v6,vA,vE), (v3,v7,vB,vF)
  1730. v0 = v0 + v4 + W[row[1]]
  1731. vC = vC ~ v0
  1732. vC = (vC & (1<<32)-1) >> 16 | vC << 16
  1733. v8 = v8 + vC
  1734. v4 = v4 ~ v8
  1735. v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  1736. v0 = v0 + v4 + W[row[2]]
  1737. vC = vC ~ v0
  1738. vC = (vC & (1<<32)-1) >> 8 | vC << 24
  1739. v8 = v8 + vC
  1740. v4 = v4 ~ v8
  1741. v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
  1742. v1 = v1 + v5 + W[row[3]]
  1743. vD = vD ~ v1
  1744. vD = (vD & (1<<32)-1) >> 16 | vD << 16
  1745. v9 = v9 + vD
  1746. v5 = v5 ~ v9
  1747. v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  1748. v1 = v1 + v5 + W[row[4]]
  1749. vD = vD ~ v1
  1750. vD = (vD & (1<<32)-1) >> 8 | vD << 24
  1751. v9 = v9 + vD
  1752. v5 = v5 ~ v9
  1753. v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
  1754. v2 = v2 + v6 + W[row[5]]
  1755. vE = vE ~ v2
  1756. vE = (vE & (1<<32)-1) >> 16 | vE << 16
  1757. vA = vA + vE
  1758. v6 = v6 ~ vA
  1759. v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  1760. v2 = v2 + v6 + W[row[6]]
  1761. vE = vE ~ v2
  1762. vE = (vE & (1<<32)-1) >> 8 | vE << 24
  1763. vA = vA + vE
  1764. v6 = v6 ~ vA
  1765. v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
  1766. v3 = v3 + v7 + W[row[7]]
  1767. vF = vF ~ v3
  1768. vF = (vF & (1<<32)-1) >> 16 | vF << 16
  1769. vB = vB + vF
  1770. v7 = v7 ~ vB
  1771. v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  1772. v3 = v3 + v7 + W[row[8]]
  1773. vF = vF ~ v3
  1774. vF = (vF & (1<<32)-1) >> 8 | vF << 24
  1775. vB = vB + vF
  1776. v7 = v7 ~ vB
  1777. v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
        -- Last four groups work on the diagonals:
        -- (v0,v5,vA,vF), (v1,v6,vB,vC), (v2,v7,v8,vD), (v3,v4,v9,vE)
  1778. v0 = v0 + v5 + W[row[9]]
  1779. vF = vF ~ v0
  1780. vF = (vF & (1<<32)-1) >> 16 | vF << 16
  1781. vA = vA + vF
  1782. v5 = v5 ~ vA
  1783. v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  1784. v0 = v0 + v5 + W[row[10]]
  1785. vF = vF ~ v0
  1786. vF = (vF & (1<<32)-1) >> 8 | vF << 24
  1787. vA = vA + vF
  1788. v5 = v5 ~ vA
  1789. v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
  1790. v1 = v1 + v6 + W[row[11]]
  1791. vC = vC ~ v1
  1792. vC = (vC & (1<<32)-1) >> 16 | vC << 16
  1793. vB = vB + vC
  1794. v6 = v6 ~ vB
  1795. v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  1796. v1 = v1 + v6 + W[row[12]]
  1797. vC = vC ~ v1
  1798. vC = (vC & (1<<32)-1) >> 8 | vC << 24
  1799. vB = vB + vC
  1800. v6 = v6 ~ vB
  1801. v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
  1802. v2 = v2 + v7 + W[row[13]]
  1803. vD = vD ~ v2
  1804. vD = (vD & (1<<32)-1) >> 16 | vD << 16
  1805. v8 = v8 + vD
  1806. v7 = v7 ~ v8
  1807. v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  1808. v2 = v2 + v7 + W[row[14]]
  1809. vD = vD ~ v2
  1810. vD = (vD & (1<<32)-1) >> 8 | vD << 24
  1811. v8 = v8 + vD
  1812. v7 = v7 ~ v8
  1813. v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
  1814. v3 = v3 + v4 + W[row[15]]
  1815. vE = vE ~ v3
  1816. vE = (vE & (1<<32)-1) >> 16 | vE << 16
  1817. v9 = v9 + vE
  1818. v4 = v4 ~ v9
  1819. v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  1820. v3 = v3 + v4 + W[row[16]]
  1821. vE = vE ~ v3
  1822. vE = (vE & (1<<32)-1) >> 8 | vE << 24
  1823. v9 = v9 + vE
  1824. v4 = v4 ~ v9
  1825. v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
  1826. end
        -- feed-forward: fold both halves of the working vector into the chaining value
  1827. h1 = h1 ~ v0 ~ v8
  1828. h2 = h2 ~ v1 ~ v9
  1829. h3 = h3 ~ v2 ~ vA
  1830. h4 = h4 ~ v3 ~ vB
  1831. h5 = h5 ~ v4 ~ vC
  1832. h6 = h6 ~ v5 ~ vD
  1833. h7 = h7 ~ v6 ~ vE
  1834. h8 = h8 ~ v7 ~ vF
  1835. end
  1836. H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  1837. return bytes_compressed
  1838. end
  1839. local function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        -- Compresses `size` bytes of `str` (starting right after byte `offs`) into the eight
        -- chaining words H[1..8], one 128-byte block per outer-loop iteration; H is updated in place.
        --   _                : second argument is accepted but never read here
        --   str              : message string; when nil/false nothing is unpacked and the 16 words
        --                      already present in common_W are used as the message block
        --   bytes_compressed : running byte counter; every block adds (last_block_size or 128) to it
        --   last_block_size  : if set, used instead of 128 for the counter and flag f0 is raised
        --   is_last_node     : if set, flag f1 is raised
        -- Returns the updated bytes_compressed.
        -- All words are full 64-bit integers here: the rotations below shift by up to 63 bits and
        -- use no masking, relying on ">>" being a logical (zero-filling) shift.
  1840. -- offs >= 0, size >= 0, size is multiple of 128
  1841. local W = common_W
  1842. local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  1843. for pos = offs + 1, offs + size, 128 do
  1844. if str then
        -- sixteen little-endian 8-byte integers
  1845. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1846. string_unpack("<i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos)
  1847. end
        -- working vector: v0..v7 start as the chaining value, v8..vF start from sha2_H_lo[1..8]
  1848. local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  1849. local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  1850. bytes_compressed = bytes_compressed + (last_block_size or 128)
  1851. vC = vC ~ bytes_compressed -- t0 = low_8_bytes(bytes_compressed)
  1852. -- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
  1853. if last_block_size then -- flag f0
  1854. vE = ~vE
  1855. end
  1856. if is_last_node then -- flag f1
  1857. vF = ~vF
  1858. end
        -- twelve rounds; sigma[j] is the row of message-word indices used by round j
  1859. for j = 1, 12 do
  1860. local row = sigma[j]
        -- Each group of 12 statements mixes one quadruple of v-words with two message words.
        -- "x >> n | x << 64-n" is a 64-bit rotate right by n (n = 32, 24, 16, 63).
        -- First four groups work on the columns of the 4x4 state:
        -- (v0,v4,v8,vC), (v1,v5,v9,vD), (v2,v6,vA,vE), (v3,v7,vB,vF)
  1861. v0 = v0 + v4 + W[row[1]]
  1862. vC = vC ~ v0
  1863. vC = vC >> 32 | vC << 32
  1864. v8 = v8 + vC
  1865. v4 = v4 ~ v8
  1866. v4 = v4 >> 24 | v4 << 40
  1867. v0 = v0 + v4 + W[row[2]]
  1868. vC = vC ~ v0
  1869. vC = vC >> 16 | vC << 48
  1870. v8 = v8 + vC
  1871. v4 = v4 ~ v8
  1872. v4 = v4 >> 63 | v4 << 1
  1873. v1 = v1 + v5 + W[row[3]]
  1874. vD = vD ~ v1
  1875. vD = vD >> 32 | vD << 32
  1876. v9 = v9 + vD
  1877. v5 = v5 ~ v9
  1878. v5 = v5 >> 24 | v5 << 40
  1879. v1 = v1 + v5 + W[row[4]]
  1880. vD = vD ~ v1
  1881. vD = vD >> 16 | vD << 48
  1882. v9 = v9 + vD
  1883. v5 = v5 ~ v9
  1884. v5 = v5 >> 63 | v5 << 1
  1885. v2 = v2 + v6 + W[row[5]]
  1886. vE = vE ~ v2
  1887. vE = vE >> 32 | vE << 32
  1888. vA = vA + vE
  1889. v6 = v6 ~ vA
  1890. v6 = v6 >> 24 | v6 << 40
  1891. v2 = v2 + v6 + W[row[6]]
  1892. vE = vE ~ v2
  1893. vE = vE >> 16 | vE << 48
  1894. vA = vA + vE
  1895. v6 = v6 ~ vA
  1896. v6 = v6 >> 63 | v6 << 1
  1897. v3 = v3 + v7 + W[row[7]]
  1898. vF = vF ~ v3
  1899. vF = vF >> 32 | vF << 32
  1900. vB = vB + vF
  1901. v7 = v7 ~ vB
  1902. v7 = v7 >> 24 | v7 << 40
  1903. v3 = v3 + v7 + W[row[8]]
  1904. vF = vF ~ v3
  1905. vF = vF >> 16 | vF << 48
  1906. vB = vB + vF
  1907. v7 = v7 ~ vB
  1908. v7 = v7 >> 63 | v7 << 1
        -- Last four groups work on the diagonals:
        -- (v0,v5,vA,vF), (v1,v6,vB,vC), (v2,v7,v8,vD), (v3,v4,v9,vE)
  1909. v0 = v0 + v5 + W[row[9]]
  1910. vF = vF ~ v0
  1911. vF = vF >> 32 | vF << 32
  1912. vA = vA + vF
  1913. v5 = v5 ~ vA
  1914. v5 = v5 >> 24 | v5 << 40
  1915. v0 = v0 + v5 + W[row[10]]
  1916. vF = vF ~ v0
  1917. vF = vF >> 16 | vF << 48
  1918. vA = vA + vF
  1919. v5 = v5 ~ vA
  1920. v5 = v5 >> 63 | v5 << 1
  1921. v1 = v1 + v6 + W[row[11]]
  1922. vC = vC ~ v1
  1923. vC = vC >> 32 | vC << 32
  1924. vB = vB + vC
  1925. v6 = v6 ~ vB
  1926. v6 = v6 >> 24 | v6 << 40
  1927. v1 = v1 + v6 + W[row[12]]
  1928. vC = vC ~ v1
  1929. vC = vC >> 16 | vC << 48
  1930. vB = vB + vC
  1931. v6 = v6 ~ vB
  1932. v6 = v6 >> 63 | v6 << 1
  1933. v2 = v2 + v7 + W[row[13]]
  1934. vD = vD ~ v2
  1935. vD = vD >> 32 | vD << 32
  1936. v8 = v8 + vD
  1937. v7 = v7 ~ v8
  1938. v7 = v7 >> 24 | v7 << 40
  1939. v2 = v2 + v7 + W[row[14]]
  1940. vD = vD ~ v2
  1941. vD = vD >> 16 | vD << 48
  1942. v8 = v8 + vD
  1943. v7 = v7 ~ v8
  1944. v7 = v7 >> 63 | v7 << 1
  1945. v3 = v3 + v4 + W[row[15]]
  1946. vE = vE ~ v3
  1947. vE = vE >> 32 | vE << 32
  1948. v9 = v9 + vE
  1949. v4 = v4 ~ v9
  1950. v4 = v4 >> 24 | v4 << 40
  1951. v3 = v3 + v4 + W[row[16]]
  1952. vE = vE ~ v3
  1953. vE = vE >> 16 | vE << 48
  1954. v9 = v9 + vE
  1955. v4 = v4 ~ v9
  1956. v4 = v4 >> 63 | v4 << 1
  1957. end
        -- feed-forward: fold both halves of the working vector into the chaining value
  1958. h1 = h1 ~ v0 ~ v8
  1959. h2 = h2 ~ v1 ~ v9
  1960. h3 = h3 ~ v2 ~ vA
  1961. h4 = h4 ~ v3 ~ vB
  1962. h5 = h5 ~ v4 ~ vC
  1963. h6 = h6 ~ v5 ~ vD
  1964. h7 = h7 ~ v6 ~ vE
  1965. h8 = h8 ~ v7 ~ vF
  1966. end
  1967. H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  1968. return bytes_compressed
  1969. end
  1970. local function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
        -- Compresses `size` bytes of `str` (starting right after byte `offs`), one 64-byte block
        -- per outer-loop iteration, starting from the chaining value H_in[1..8].
        --   str          : message string; when nil/false nothing is unpacked and the 16 words
        --                  already present in common_W are used as the message block
        --   flags        : placed unchanged into the last word (vF) of the working vector
        --   chunk_index  : split into low/high 32-bit parts and placed into vC/vD
        --   H_out        : table receiving the result in slots 1..8; defaults to H_in
        --   wide_output  : if set, slots 9..16 of H_out are filled as well
        --   block_length : placed into vE; defaults to 64
        -- The same flags, chunk_index and block_length are applied to every block of the call.
        -- Returns nothing. As in the other 32-bit kernels of this 64-bit-integer code, bits above
        -- bit 31 are not truncated and are masked with (1<<32)-1 only before right shifts.
  1971. -- offs >= 0, size >= 0, size is multiple of 64
  1972. block_length = block_length or 64
  1973. local W = common_W
  1974. local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
  1975. H_out = H_out or H_in
  1976. for pos = offs + 1, offs + size, 64 do
  1977. if str then
        -- sixteen little-endian unsigned 32-bit words
  1978. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1979. string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
  1980. end
  1981. local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  1982. local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
  1983. local t0 = chunk_index % 2^32 -- t0 = low_4_bytes(chunk_index)
  1984. local t1 = (chunk_index - t0) / 2^32 -- t1 = high_4_bytes(chunk_index)
        -- t0 and t1 are floats (2^32 is a float); "0|" turns them into integers
  1985. local vC, vD, vE, vF = 0|t0, 0|t1, block_length, flags
        -- seven rounds; message words are picked through perm_blake3 at index j plus a fixed
        -- per-slot offset (the table itself is defined outside this function)
  1986. for j = 1, 7 do
        -- Each group of 12 statements mixes one quadruple of v-words with two message words.
        -- "(x & (1<<32)-1) >> n | x << 32-n" is a 32-bit rotate right by n (n = 16, 12, 8, 7).
        -- First four groups: (v0,v4,v8,vC), (v1,v5,v9,vD), (v2,v6,vA,vE), (v3,v7,vB,vF)
  1987. v0 = v0 + v4 + W[perm_blake3[j]]
  1988. vC = vC ~ v0
  1989. vC = (vC & (1<<32)-1) >> 16 | vC << 16
  1990. v8 = v8 + vC
  1991. v4 = v4 ~ v8
  1992. v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  1993. v0 = v0 + v4 + W[perm_blake3[j + 14]]
  1994. vC = vC ~ v0
  1995. vC = (vC & (1<<32)-1) >> 8 | vC << 24
  1996. v8 = v8 + vC
  1997. v4 = v4 ~ v8
  1998. v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
  1999. v1 = v1 + v5 + W[perm_blake3[j + 1]]
  2000. vD = vD ~ v1
  2001. vD = (vD & (1<<32)-1) >> 16 | vD << 16
  2002. v9 = v9 + vD
  2003. v5 = v5 ~ v9
  2004. v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  2005. v1 = v1 + v5 + W[perm_blake3[j + 2]]
  2006. vD = vD ~ v1
  2007. vD = (vD & (1<<32)-1) >> 8 | vD << 24
  2008. v9 = v9 + vD
  2009. v5 = v5 ~ v9
  2010. v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
  2011. v2 = v2 + v6 + W[perm_blake3[j + 16]]
  2012. vE = vE ~ v2
  2013. vE = (vE & (1<<32)-1) >> 16 | vE << 16
  2014. vA = vA + vE
  2015. v6 = v6 ~ vA
  2016. v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  2017. v2 = v2 + v6 + W[perm_blake3[j + 7]]
  2018. vE = vE ~ v2
  2019. vE = (vE & (1<<32)-1) >> 8 | vE << 24
  2020. vA = vA + vE
  2021. v6 = v6 ~ vA
  2022. v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
  2023. v3 = v3 + v7 + W[perm_blake3[j + 15]]
  2024. vF = vF ~ v3
  2025. vF = (vF & (1<<32)-1) >> 16 | vF << 16
  2026. vB = vB + vF
  2027. v7 = v7 ~ vB
  2028. v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  2029. v3 = v3 + v7 + W[perm_blake3[j + 17]]
  2030. vF = vF ~ v3
  2031. vF = (vF & (1<<32)-1) >> 8 | vF << 24
  2032. vB = vB + vF
  2033. v7 = v7 ~ vB
  2034. v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
        -- Last four groups: (v0,v5,vA,vF), (v1,v6,vB,vC), (v2,v7,v8,vD), (v3,v4,v9,vE)
  2035. v0 = v0 + v5 + W[perm_blake3[j + 21]]
  2036. vF = vF ~ v0
  2037. vF = (vF & (1<<32)-1) >> 16 | vF << 16
  2038. vA = vA + vF
  2039. v5 = v5 ~ vA
  2040. v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  2041. v0 = v0 + v5 + W[perm_blake3[j + 5]]
  2042. vF = vF ~ v0
  2043. vF = (vF & (1<<32)-1) >> 8 | vF << 24
  2044. vA = vA + vF
  2045. v5 = v5 ~ vA
  2046. v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
  2047. v1 = v1 + v6 + W[perm_blake3[j + 3]]
  2048. vC = vC ~ v1
  2049. vC = (vC & (1<<32)-1) >> 16 | vC << 16
  2050. vB = vB + vC
  2051. v6 = v6 ~ vB
  2052. v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  2053. v1 = v1 + v6 + W[perm_blake3[j + 6]]
  2054. vC = vC ~ v1
  2055. vC = (vC & (1<<32)-1) >> 8 | vC << 24
  2056. vB = vB + vC
  2057. v6 = v6 ~ vB
  2058. v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
  2059. v2 = v2 + v7 + W[perm_blake3[j + 4]]
  2060. vD = vD ~ v2
  2061. vD = (vD & (1<<32)-1) >> 16 | vD << 16
  2062. v8 = v8 + vD
  2063. v7 = v7 ~ v8
  2064. v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  2065. v2 = v2 + v7 + W[perm_blake3[j + 18]]
  2066. vD = vD ~ v2
  2067. vD = (vD & (1<<32)-1) >> 8 | vD << 24
  2068. v8 = v8 + vD
  2069. v7 = v7 ~ v8
  2070. v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
  2071. v3 = v3 + v4 + W[perm_blake3[j + 19]]
  2072. vE = vE ~ v3
  2073. vE = (vE & (1<<32)-1) >> 16 | vE << 16
  2074. v9 = v9 + vE
  2075. v4 = v4 ~ v9
  2076. v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  2077. v3 = v3 + v4 + W[perm_blake3[j + 20]]
  2078. vE = vE ~ v3
  2079. vE = (vE & (1<<32)-1) >> 8 | vE << 24
  2080. v9 = v9 + vE
  2081. v4 = v4 ~ v9
  2082. v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
  2083. end
        -- extended output: must run before h1..h8 are overwritten below, because it combines
        -- the chaining value this block started from with the upper half of the working vector
  2084. if wide_output then
  2085. H_out[ 9] = h1 ~ v8
  2086. H_out[10] = h2 ~ v9
  2087. H_out[11] = h3 ~ vA
  2088. H_out[12] = h4 ~ vB
  2089. H_out[13] = h5 ~ vC
  2090. H_out[14] = h6 ~ vD
  2091. H_out[15] = h7 ~ vE
  2092. H_out[16] = h8 ~ vF
  2093. end
        -- new chaining value: lower half XOR upper half (the previous h is replaced, not XORed in)
  2094. h1 = v0 ~ v8
  2095. h2 = v1 ~ v9
  2096. h3 = v2 ~ vA
  2097. h4 = v3 ~ vB
  2098. h5 = v4 ~ vC
  2099. h6 = v5 ~ vD
  2100. h7 = v6 ~ vE
  2101. h8 = v7 ~ vF
  2102. end
  2103. H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
  2104. end
  2105. return HEX64, XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
  2106. ]=](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3)
  2107. end
  2108. if branch == "INT32" then
  2109. -- implementation for Lua 5.3/5.4 having non-standard numbers config "int32"+"double" (built with LUA_INT_TYPE=LUA_INT_INT)
  2110. K_lo_modulo = 2^32
  function HEX(x)
     -- Formats x as exactly eight lowercase hexadecimal digits, left-padded with zeros.
     local eight_digits = string_format("%08x", x)
     return eight_digits
  end
  2114. XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64 = load[=[-- branch "INT32"
  2115. local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3 = ...
  2116. local string_unpack, floor = string.unpack, math.floor
  local function XORA5(x, y)
     -- XORs x with y after folding y into the signed 32-bit range [-2^31, 2^31);
     -- when y is absent (nil/false) the fixed pattern 0xA5A5A5A5 is used instead.
     local operand
     if y then
        operand = (y + 2^31) % 2^32 - 2^31
     else
        operand = 0xA5A5A5A5
     end
     return x ~ operand
  end
  local function XOR_BYTE(x, y)
     -- Bitwise exclusive-or of the two operands, using the native operator.
     local mixed = x ~ y
     return mixed
  end
  2123. local function sha256_feed_64(H, str, offs, size)
        -- Processes `size` bytes of `str` (starting right after byte `offs`) in 64-byte blocks and
        -- updates the eight state words H[1..8] in place. Returns nothing.
        -- This branch targets builds whose integer type is 32 bits wide (see the branch comment),
        -- so "+" wraps modulo 2^32 on its own and ">>" / "<<" are zero-filling 32-bit shifts;
        -- a pair such as "a>>7 ~ a<<25" is therefore a 32-bit rotate right by 7.
  2124. -- offs >= 0, size >= 0, size is multiple of 64
  2125. local W, K = common_W, sha2_K_hi
  2126. local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  2127. for pos = offs + 1, offs + size, 64 do
        -- sixteen big-endian signed 32-bit words
  2128. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2129. string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
        -- message schedule: extend the 16 words to 64
        -- (rotr7 ~ rotr18 ~ shr3 of W[j-15]) + (rotr17 ~ rotr19 ~ shr10 of W[j-2]) + W[j-7] + W[j-16]
  2130. for j = 17, 64 do
  2131. local a, b = W[j-15], W[j-2]
  2132. W[j] = (a>>7 ~ a<<25 ~ a<<14 ~ a>>18 ~ a>>3) + (b<<15 ~ b>>17 ~ b<<13 ~ b>>19 ~ b>>10) + W[j-7] + W[j-16]
  2133. end
  2134. local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
  2135. for j = 1, 64 do
        -- z = (rotr6 ~ rotr11 ~ rotr25 of e) + (bitwise choice between f and g, selected by e) + h + K[j] + W[j]
  2136. local z = (e>>6 ~ e<<26 ~ e>>11 ~ e<<21 ~ e>>25 ~ e<<7) + (g ~ e & (f ~ g)) + h + K[j] + W[j]
  2137. h = g
  2138. g = f
  2139. f = e
  2140. e = z + d
  2141. d = c
  2142. c = b
  2143. b = a
        -- the registers were already shifted: c and d now hold the previous b and c, so the
        -- middle term is the bitwise majority of the previous a, b, c; the last term is
        -- rotr2 ~ rotr13 ~ rotr22 of a
  2144. a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a<<30 ~ a>>13 ~ a<<19 ~ a<<10 ~ a>>22)
  2145. end
        -- add the working registers back into the state
  2146. h1 = a + h1
  2147. h2 = b + h2
  2148. h3 = c + h3
  2149. h4 = d + h4
  2150. h5 = e + h5
  2151. h6 = f + h6
  2152. h7 = g + h7
  2153. h8 = h + h8
  2154. end
  2155. H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  2156. end
  2157. local function sha512_feed_128(H_lo, H_hi, str, offs, size)
        -- Processes `size` bytes of `str` (starting right after byte `offs`) in 128-byte blocks.
        -- Every 64-bit quantity is held as two 32-bit integers: the eight state words are split
        -- across H_lo[1..8] (low halves) and H_hi[1..8] (high halves), both updated in place.
        -- Returns nothing.
        -- Carry handling used throughout:
        --   * "x % 2^32" turns a signed 32-bit low half into a non-negative float, so several
        --     low halves can be summed exactly;
        --   * "floor(sum / 2^32)" is the carry that is added to the high half;
        --   * "0|((sum + 2^31) % 2^32 - 2^31)" folds the sum back into the signed 32-bit range
        --     and converts it to an integer.
        -- High halves are added as integers and rely on wraparound of the integer type.
  2158. -- offs >= 0, size >= 0, size is multiple of 128
  2159. -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  2160. local floor, W, K_lo, K_hi = floor, common_W, sha2_K_lo, sha2_K_hi
  2161. local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  2162. local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  2163. for pos = offs + 1, offs + size, 128 do
        -- thirty-two big-endian signed 32-bit words = sixteen (hi, lo) pairs
  2164. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16],
  2165. W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] =
  2166. string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
        -- message schedule for words 17..80; jj is the index of the low half of word jj/2,
        -- a = word (jj/2 - 15), b = word (jj/2 - 2), and the two added words are (jj/2 - 7), (jj/2 - 16)
  2167. for jj = 17*2, 80*2, 2 do
  2168. local a_lo, a_hi, b_lo, b_hi = W[jj-30], W[jj-31], W[jj-4], W[jj-5]
        -- tmp: exact sum of the four low halves (may exceed 2^32)
  2169. local tmp =
  2170. (a_lo>>1 ~ a_hi<<31 ~ a_lo>>8 ~ a_hi<<24 ~ a_lo>>7 ~ a_hi<<25) % 2^32
  2171. + (b_lo>>19 ~ b_hi<<13 ~ b_lo<<3 ~ b_hi>>29 ~ b_lo>>6 ~ b_hi<<26) % 2^32
  2172. + W[jj-14] % 2^32 + W[jj-32] % 2^32
        -- high half: same terms on the high halves, plus the carry out of tmp
  2173. W[jj-1] =
  2174. (a_hi>>1 ~ a_lo<<31 ~ a_hi>>8 ~ a_lo<<24 ~ a_hi>>7)
  2175. + (b_hi>>19 ~ b_lo<<13 ~ b_hi<<3 ~ b_lo>>29 ~ b_hi>>6)
  2176. + W[jj-15] + W[jj-33] + floor(tmp / 2^32)
  2177. W[jj] = 0|((tmp + 2^31) % 2^32 - 2^31)
  2178. end
  2179. local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2180. local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2181. for j = 1, 80 do
  2182. local jj = 2*j
        -- K_lo[j] is added without "% 2^32"; presumably the table already stores non-negative
        -- values below 2^32 (K_lo_modulo is set to 2^32 for this branch) -- TODO confirm
  2183. local z_lo = (e_lo>>14 ~ e_hi<<18 ~ e_lo>>18 ~ e_hi<<14 ~ e_lo<<23 ~ e_hi>>9) % 2^32 + (g_lo ~ e_lo & (f_lo ~ g_lo)) % 2^32 + h_lo % 2^32 + K_lo[j] + W[jj] % 2^32
  2184. local z_hi = (e_hi>>14 ~ e_lo<<18 ~ e_hi>>18 ~ e_lo<<14 ~ e_hi<<23 ~ e_lo>>9) + (g_hi ~ e_hi & (f_hi ~ g_hi)) + h_hi + K_hi[j] + W[jj-1] + floor(z_lo / 2^32)
        -- the carry has been consumed by z_hi; keep only the low 32 bits (still a float)
  2185. z_lo = z_lo % 2^32
  2186. h_lo = g_lo; h_hi = g_hi
  2187. g_lo = f_lo; g_hi = f_hi
  2188. f_lo = e_lo; f_hi = e_hi
        -- e = z + d, as a 64-bit addition with carry
  2189. e_lo = z_lo + d_lo % 2^32
  2190. e_hi = z_hi + d_hi + floor(e_lo / 2^32)
  2191. e_lo = 0|((e_lo + 2^31) % 2^32 - 2^31)
  2192. d_lo = c_lo; d_hi = c_hi
  2193. c_lo = b_lo; c_hi = b_hi
  2194. b_lo = a_lo; b_hi = a_hi
        -- the registers were already shifted: b, c, d now hold the previous a, b, c.
        -- a = z + bitwise majority of (b, c, d) + (rotr28 ~ rotr34 ~ rotr39 of b), with carry
  2195. z_lo = z_lo + (d_lo & c_lo ~ b_lo & (d_lo ~ c_lo)) % 2^32 + (b_lo>>28 ~ b_hi<<4 ~ b_lo<<30 ~ b_hi>>2 ~ b_lo<<25 ~ b_hi>>7) % 2^32
  2196. a_hi = z_hi + (d_hi & c_hi ~ b_hi & (d_hi ~ c_hi)) + (b_hi>>28 ~ b_lo<<4 ~ b_hi<<30 ~ b_lo>>2 ~ b_hi<<25 ~ b_lo>>7) + floor(z_lo / 2^32)
  2197. a_lo = 0|((z_lo + 2^31) % 2^32 - 2^31)
  2198. end
        -- add the working registers back into the state; a_lo is reused as a scratch variable
        -- for each low-half sum once its own value has been consumed by the first addition
  2199. a_lo = h1_lo % 2^32 + a_lo % 2^32
  2200. h1_hi = h1_hi + a_hi + floor(a_lo / 2^32)
  2201. h1_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2202. a_lo = h2_lo % 2^32 + b_lo % 2^32
  2203. h2_hi = h2_hi + b_hi + floor(a_lo / 2^32)
  2204. h2_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2205. a_lo = h3_lo % 2^32 + c_lo % 2^32
  2206. h3_hi = h3_hi + c_hi + floor(a_lo / 2^32)
  2207. h3_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2208. a_lo = h4_lo % 2^32 + d_lo % 2^32
  2209. h4_hi = h4_hi + d_hi + floor(a_lo / 2^32)
  2210. h4_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2211. a_lo = h5_lo % 2^32 + e_lo % 2^32
  2212. h5_hi = h5_hi + e_hi + floor(a_lo / 2^32)
  2213. h5_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2214. a_lo = h6_lo % 2^32 + f_lo % 2^32
  2215. h6_hi = h6_hi + f_hi + floor(a_lo / 2^32)
  2216. h6_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2217. a_lo = h7_lo % 2^32 + g_lo % 2^32
  2218. h7_hi = h7_hi + g_hi + floor(a_lo / 2^32)
  2219. h7_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2220. a_lo = h8_lo % 2^32 + h_lo % 2^32
  2221. h8_hi = h8_hi + h_hi + floor(a_lo / 2^32)
  2222. h8_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2223. end
  2224. H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2225. H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2226. end
  2227. local function md5_feed_64(H, str, offs, size)
        -- Processes `size` bytes of `str` (starting right after byte `offs`) in 64-byte blocks and
        -- updates the four state words H[1..4] in place. Returns nothing.
        -- Integer arithmetic is expected to wrap at 32 bits in this branch (see the branch comment).
        -- In every step "F << 32-s | F>>s" parses as (F << (32-s)) | (F >> s), i.e. a 32-bit rotate
        -- left by 32-s bits; s is then replaced by md5_next_shift[s] (table defined outside this
        -- function) to obtain the amount for the following step.
  2228. -- offs >= 0, size >= 0, size is multiple of 64
  2229. local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
  2230. local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  2231. for pos = offs + 1, offs + size, 64 do
        -- sixteen little-endian signed 32-bit words
  2232. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2233. string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
  2234. local a, b, c, d = h1, h2, h3, h4
        -- steps 1..16: message words in natural order; first rotation is left by 7
  2235. local s = 32-7
  2236. for j = 1, 16 do
  2237. local F = (d ~ b & (c ~ d)) + a + K[j] + W[j]
  2238. a = d
  2239. d = c
  2240. c = b
  2241. b = (F << 32-s | F>>s) + b
  2242. s = md5_next_shift[s]
  2243. end
        -- steps 17..32: message word index is ((5*j-4) & 15) + 1; first rotation is left by 5
  2244. s = 32-5
  2245. for j = 17, 32 do
  2246. local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1]
  2247. a = d
  2248. d = c
  2249. c = b
  2250. b = (F << 32-s | F>>s) + b
  2251. s = md5_next_shift[s]
  2252. end
        -- steps 33..48: message word index is ((3*j+2) & 15) + 1; first rotation is left by 4
  2253. s = 32-4
  2254. for j = 33, 48 do
  2255. local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1]
  2256. a = d
  2257. d = c
  2258. c = b
  2259. b = (F << 32-s | F>>s) + b
  2260. s = md5_next_shift[s]
  2261. end
        -- steps 49..64: message word index is ((j*7-7) & 15) + 1; first rotation is left by 6
  2262. s = 32-6
  2263. for j = 49, 64 do
  2264. local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1]
  2265. a = d
  2266. d = c
  2267. c = b
  2268. b = (F << 32-s | F>>s) + b
  2269. s = md5_next_shift[s]
  2270. end
        -- add the working registers back into the state
  2271. h1 = a + h1
  2272. h2 = b + h2
  2273. h3 = c + h3
  2274. h4 = d + h4
  2275. end
  2276. H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  2277. end
  2278. local function sha1_feed_64(H, str, offs, size)
        -- Processes `size` bytes of `str` (starting right after byte `offs`) in 64-byte blocks and
        -- updates the five state words H[1..5] in place. Returns nothing.
        -- Integer arithmetic is expected to wrap at 32 bits in this branch (see the branch comment);
        -- "x << n ~ x >> 32-n" is then a 32-bit rotate left by n.
  2279. -- offs >= 0, size >= 0, size is multiple of 64
  2280. local W = common_W
  2281. local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
  2282. for pos = offs + 1, offs + size, 64 do
        -- sixteen big-endian signed 32-bit words
  2283. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2284. string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
        -- message schedule: extend the 16 words to 80 (XOR of four earlier words, rotated left by 1)
  2285. for j = 17, 80 do
  2286. local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16]
  2287. W[j] = a << 1 ~ a >> 31
  2288. end
  2289. local a, b, c, d, e = h1, h2, h3, h4, h5
        -- four groups of 20 steps; they differ only in the boolean function of (b, c, d) and in
        -- the additive constant. Every step rotates a left by 5 and b left by 30.
        -- steps 1..20: bitwise choice between c and d, selected by b
  2290. for j = 1, 20 do
  2291. local z = (a << 5 ~ a >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2))
  2292. e = d
  2293. d = c
  2294. c = b << 30 ~ b >> 2
  2295. b = a
  2296. a = z
  2297. end
        -- steps 21..40: parity (XOR) of b, c, d
  2298. for j = 21, 40 do
  2299. local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3)
  2300. e = d
  2301. d = c
  2302. c = b << 30 ~ b >> 2
  2303. b = a
  2304. a = z
  2305. end
        -- steps 41..60: bitwise majority of b, c, d
  2306. for j = 41, 60 do
  2307. local z = (a << 5 ~ a >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5)
  2308. e = d
  2309. d = c
  2310. c = b << 30 ~ b >> 2
  2311. b = a
  2312. a = z
  2313. end
        -- steps 61..80: parity (XOR) of b, c, d again
  2314. for j = 61, 80 do
  2315. local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10)
  2316. e = d
  2317. d = c
  2318. c = b << 30 ~ b >> 2
  2319. b = a
  2320. a = z
  2321. end
        -- add the working registers back into the state
  2322. h1 = a + h1
  2323. h2 = b + h2
  2324. h3 = c + h3
  2325. h4 = d + h4
  2326. h5 = e + h5
  2327. end
  2328. H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
  2329. end
        -- Unpack formats for keccak_feed, looked up by the number of 8-byte words per block.
        -- NOTE(review): build_keccak_format is defined outside this chunk; presumably it returns a
        -- table whose entry n is a format made of n repetitions of "i4i4" -- confirm there.
  2330. local keccak_format_i4i4 = build_keccak_format("i4i4")
  2331. local function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
        -- Absorbs `size` bytes of `str` (starting right after byte `offs`) in blocks of
        -- block_size_in_bytes and runs 24 permutation rounds after each block.
        -- The state is 25 lanes of 64 bits, each split into two 32-bit integers:
        -- lanes_lo[1..25] hold the low halves and lanes_hi[1..25] the high halves; both tables
        -- are updated in place. Returns nothing.
        -- A 64-bit rotate left by r is assembled from 32-bit shifts:
        --   r < 32 : lo' = lo<<r ~ hi>>(32-r),   hi' = hi<<r ~ lo>>(32-r)
        --   r >= 32: the same with lo and hi swapped on the right-hand side and r-32 in place of r
  2332. -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
  2333. local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
  2334. local qwords_qty = block_size_in_bytes / 8
  2335. local keccak_format = keccak_format_i4i4[qwords_qty]
  2336. for pos = offs + 1, offs + size, block_size_in_bytes do
        -- XOR the message block into the first qwords_qty lanes:
        -- odd-numbered unpacked values go to the low halves, even-numbered ones to the high halves
  2337. local dwords_from_message = {string_unpack(keccak_format, str, pos)}
  2338. for j = 1, qwords_qty do
  2339. lanes_lo[j] = lanes_lo[j] ~ dwords_from_message[2*j-1]
  2340. lanes_hi[j] = lanes_hi[j] ~ dwords_from_message[2*j]
  2341. end
        -- copy the whole state into locals for the duration of the 24 rounds
  2342. local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi,
  2343. L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi,
  2344. L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi =
  2345. lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5],
  2346. lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10],
  2347. lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15],
  2348. lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20],
  2349. lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25]
  2350. for round_idx = 1, 24 do
        -- column parities: Ck is the XOR of the five lanes k, k+5, k+10, k+15, k+20
  2351. local C1_lo = L01_lo ~ L06_lo ~ L11_lo ~ L16_lo ~ L21_lo
  2352. local C1_hi = L01_hi ~ L06_hi ~ L11_hi ~ L16_hi ~ L21_hi
  2353. local C2_lo = L02_lo ~ L07_lo ~ L12_lo ~ L17_lo ~ L22_lo
  2354. local C2_hi = L02_hi ~ L07_hi ~ L12_hi ~ L17_hi ~ L22_hi
  2355. local C3_lo = L03_lo ~ L08_lo ~ L13_lo ~ L18_lo ~ L23_lo
  2356. local C3_hi = L03_hi ~ L08_hi ~ L13_hi ~ L18_hi ~ L23_hi
  2357. local C4_lo = L04_lo ~ L09_lo ~ L14_lo ~ L19_lo ~ L24_lo
  2358. local C4_hi = L04_hi ~ L09_hi ~ L14_hi ~ L19_hi ~ L24_hi
  2359. local C5_lo = L05_lo ~ L10_lo ~ L15_lo ~ L20_lo ~ L25_lo
  2360. local C5_hi = L05_hi ~ L10_hi ~ L15_hi ~ L20_hi ~ L25_hi
        -- Column of lanes 2, 7, 12, 17, 22: D = C1 ~ (C3 rotated left by 1 as a 64-bit value).
        -- Each lane of the column is XORed with D into a temporary, then rotated as a 64-bit
        -- value and stored into a different slot of the same column (rotation and lane
        -- permutation are fused). The same pattern repeats for the other columns below.
  2361. local D_lo = C1_lo ~ C3_lo<<1 ~ C3_hi>>31
  2362. local D_hi = C1_hi ~ C3_hi<<1 ~ C3_lo>>31
  2363. local T0_lo = D_lo ~ L02_lo
  2364. local T0_hi = D_hi ~ L02_hi
  2365. local T1_lo = D_lo ~ L07_lo
  2366. local T1_hi = D_hi ~ L07_hi
  2367. local T2_lo = D_lo ~ L12_lo
  2368. local T2_hi = D_hi ~ L12_hi
  2369. local T3_lo = D_lo ~ L17_lo
  2370. local T3_hi = D_hi ~ L17_hi
  2371. local T4_lo = D_lo ~ L22_lo
  2372. local T4_hi = D_hi ~ L22_hi
        -- 64-bit left rotations by 44, 45, 1, 10, 2
  2373. L02_lo = T1_lo>>20 ~ T1_hi<<12
  2374. L02_hi = T1_hi>>20 ~ T1_lo<<12
  2375. L07_lo = T3_lo>>19 ~ T3_hi<<13
  2376. L07_hi = T3_hi>>19 ~ T3_lo<<13
  2377. L12_lo = T0_lo<<1 ~ T0_hi>>31
  2378. L12_hi = T0_hi<<1 ~ T0_lo>>31
  2379. L17_lo = T2_lo<<10 ~ T2_hi>>22
  2380. L17_hi = T2_hi<<10 ~ T2_lo>>22
  2381. L22_lo = T4_lo<<2 ~ T4_hi>>30
  2382. L22_hi = T4_hi<<2 ~ T4_lo>>30
        -- Column of lanes 3, 8, 13, 18, 23: D = C2 ~ rotl1(C4)
  2383. D_lo = C2_lo ~ C4_lo<<1 ~ C4_hi>>31
  2384. D_hi = C2_hi ~ C4_hi<<1 ~ C4_lo>>31
  2385. T0_lo = D_lo ~ L03_lo
  2386. T0_hi = D_hi ~ L03_hi
  2387. T1_lo = D_lo ~ L08_lo
  2388. T1_hi = D_hi ~ L08_hi
  2389. T2_lo = D_lo ~ L13_lo
  2390. T2_hi = D_hi ~ L13_hi
  2391. T3_lo = D_lo ~ L18_lo
  2392. T3_hi = D_hi ~ L18_hi
  2393. T4_lo = D_lo ~ L23_lo
  2394. T4_hi = D_hi ~ L23_hi
        -- 64-bit left rotations by 43, 61, 6, 15, 62
  2395. L03_lo = T2_lo>>21 ~ T2_hi<<11
  2396. L03_hi = T2_hi>>21 ~ T2_lo<<11
  2397. L08_lo = T4_lo>>3 ~ T4_hi<<29
  2398. L08_hi = T4_hi>>3 ~ T4_lo<<29
  2399. L13_lo = T1_lo<<6 ~ T1_hi>>26
  2400. L13_hi = T1_hi<<6 ~ T1_lo>>26
  2401. L18_lo = T3_lo<<15 ~ T3_hi>>17
  2402. L18_hi = T3_hi<<15 ~ T3_lo>>17
  2403. L23_lo = T0_lo>>2 ~ T0_hi<<30
  2404. L23_hi = T0_hi>>2 ~ T0_lo<<30
        -- Column of lanes 4, 9, 14, 19, 24: D = C3 ~ rotl1(C5)
  2405. D_lo = C3_lo ~ C5_lo<<1 ~ C5_hi>>31
  2406. D_hi = C3_hi ~ C5_hi<<1 ~ C5_lo>>31
  2407. T0_lo = D_lo ~ L04_lo
  2408. T0_hi = D_hi ~ L04_hi
  2409. T1_lo = D_lo ~ L09_lo
  2410. T1_hi = D_hi ~ L09_hi
  2411. T2_lo = D_lo ~ L14_lo
  2412. T2_hi = D_hi ~ L14_hi
  2413. T3_lo = D_lo ~ L19_lo
  2414. T3_hi = D_hi ~ L19_hi
  2415. T4_lo = D_lo ~ L24_lo
  2416. T4_hi = D_hi ~ L24_hi
        -- 64-bit left rotations by 21, 28, 25, 56, 55
  2417. L04_lo = T3_lo<<21 ~ T3_hi>>11
  2418. L04_hi = T3_hi<<21 ~ T3_lo>>11
  2419. L09_lo = T0_lo<<28 ~ T0_hi>>4
  2420. L09_hi = T0_hi<<28 ~ T0_lo>>4
  2421. L14_lo = T2_lo<<25 ~ T2_hi>>7
  2422. L14_hi = T2_hi<<25 ~ T2_lo>>7
  2423. L19_lo = T4_lo>>8 ~ T4_hi<<24
  2424. L19_hi = T4_hi>>8 ~ T4_lo<<24
  2425. L24_lo = T1_lo>>9 ~ T1_hi<<23
  2426. L24_hi = T1_hi>>9 ~ T1_lo<<23
        -- Column of lanes 5, 10, 15, 20, 25: D = C4 ~ rotl1(C1)
  2427. D_lo = C4_lo ~ C1_lo<<1 ~ C1_hi>>31
  2428. D_hi = C4_hi ~ C1_hi<<1 ~ C1_lo>>31
  2429. T0_lo = D_lo ~ L05_lo
  2430. T0_hi = D_hi ~ L05_hi
  2431. T1_lo = D_lo ~ L10_lo
  2432. T1_hi = D_hi ~ L10_hi
  2433. T2_lo = D_lo ~ L15_lo
  2434. T2_hi = D_hi ~ L15_hi
  2435. T3_lo = D_lo ~ L20_lo
  2436. T3_hi = D_hi ~ L20_hi
  2437. T4_lo = D_lo ~ L25_lo
  2438. T4_hi = D_hi ~ L25_hi
        -- 64-bit left rotations by 14, 20, 8, 27, 39
  2439. L05_lo = T4_lo<<14 ~ T4_hi>>18
  2440. L05_hi = T4_hi<<14 ~ T4_lo>>18
  2441. L10_lo = T1_lo<<20 ~ T1_hi>>12
  2442. L10_hi = T1_hi<<20 ~ T1_lo>>12
  2443. L15_lo = T3_lo<<8 ~ T3_hi>>24
  2444. L15_hi = T3_hi<<8 ~ T3_lo>>24
  2445. L20_lo = T0_lo<<27 ~ T0_hi>>5
  2446. L20_hi = T0_hi<<27 ~ T0_lo>>5
  2447. L25_lo = T2_lo>>25 ~ T2_hi<<7
  2448. L25_hi = T2_hi>>25 ~ T2_lo<<7
        -- Column of lanes 1, 6, 11, 16, 21: D = C5 ~ rotl1(C2).
        -- Lane 1 is only XORed with D (no rotation, it stays in place), so no T0 is needed.
  2449. D_lo = C5_lo ~ C2_lo<<1 ~ C2_hi>>31
  2450. D_hi = C5_hi ~ C2_hi<<1 ~ C2_lo>>31
  2451. T1_lo = D_lo ~ L06_lo
  2452. T1_hi = D_hi ~ L06_hi
  2453. T2_lo = D_lo ~ L11_lo
  2454. T2_hi = D_hi ~ L11_hi
  2455. T3_lo = D_lo ~ L16_lo
  2456. T3_hi = D_hi ~ L16_hi
  2457. T4_lo = D_lo ~ L21_lo
  2458. T4_hi = D_hi ~ L21_hi
        -- 64-bit left rotations by 3, 18, 36, 41
  2459. L06_lo = T2_lo<<3 ~ T2_hi>>29
  2460. L06_hi = T2_hi<<3 ~ T2_lo>>29
  2461. L11_lo = T4_lo<<18 ~ T4_hi>>14
  2462. L11_hi = T4_hi<<18 ~ T4_lo>>14
  2463. L16_lo = T1_lo>>28 ~ T1_hi<<4
  2464. L16_hi = T1_hi>>28 ~ T1_lo<<4
  2465. L21_lo = T3_lo>>23 ~ T3_hi<<9
  2466. L21_hi = T3_hi>>23 ~ T3_lo<<9
  2467. L01_lo = D_lo ~ L01_lo
  2468. L01_hi = D_hi ~ L01_hi
        -- Row-wise nonlinear step: each new lane is x ~ (~y & z) over three lanes of its row
        -- ("a ~ ~b & c" parses as a ~ ((~b) & c)). These must stay multiple assignments: all
        -- right-hand sides are evaluated before any lane of the row is overwritten.
        -- Rows 2-5 read their source lanes in a cyclically shifted order.
  2469. L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = L01_lo ~ ~L02_lo & L03_lo, L02_lo ~ ~L03_lo & L04_lo, L03_lo ~ ~L04_lo & L05_lo, L04_lo ~ ~L05_lo & L01_lo, L05_lo ~ ~L01_lo & L02_lo
  2470. L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = L01_hi ~ ~L02_hi & L03_hi, L02_hi ~ ~L03_hi & L04_hi, L03_hi ~ ~L04_hi & L05_hi, L04_hi ~ ~L05_hi & L01_hi, L05_hi ~ ~L01_hi & L02_hi
  2471. L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = L09_lo ~ ~L10_lo & L06_lo, L10_lo ~ ~L06_lo & L07_lo, L06_lo ~ ~L07_lo & L08_lo, L07_lo ~ ~L08_lo & L09_lo, L08_lo ~ ~L09_lo & L10_lo
  2472. L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = L09_hi ~ ~L10_hi & L06_hi, L10_hi ~ ~L06_hi & L07_hi, L06_hi ~ ~L07_hi & L08_hi, L07_hi ~ ~L08_hi & L09_hi, L08_hi ~ ~L09_hi & L10_hi
  2473. L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = L12_lo ~ ~L13_lo & L14_lo, L13_lo ~ ~L14_lo & L15_lo, L14_lo ~ ~L15_lo & L11_lo, L15_lo ~ ~L11_lo & L12_lo, L11_lo ~ ~L12_lo & L13_lo
  2474. L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = L12_hi ~ ~L13_hi & L14_hi, L13_hi ~ ~L14_hi & L15_hi, L14_hi ~ ~L15_hi & L11_hi, L15_hi ~ ~L11_hi & L12_hi, L11_hi ~ ~L12_hi & L13_hi
  2475. L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = L20_lo ~ ~L16_lo & L17_lo, L16_lo ~ ~L17_lo & L18_lo, L17_lo ~ ~L18_lo & L19_lo, L18_lo ~ ~L19_lo & L20_lo, L19_lo ~ ~L20_lo & L16_lo
  2476. L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = L20_hi ~ ~L16_hi & L17_hi, L16_hi ~ ~L17_hi & L18_hi, L17_hi ~ ~L18_hi & L19_hi, L18_hi ~ ~L19_hi & L20_hi, L19_hi ~ ~L20_hi & L16_hi
  2477. L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = L23_lo ~ ~L24_lo & L25_lo, L24_lo ~ ~L25_lo & L21_lo, L25_lo ~ ~L21_lo & L22_lo, L21_lo ~ ~L22_lo & L23_lo, L22_lo ~ ~L23_lo & L24_lo
  2478. L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = L23_hi ~ ~L24_hi & L25_hi, L24_hi ~ ~L25_hi & L21_hi, L25_hi ~ ~L21_hi & L22_hi, L21_hi ~ ~L22_hi & L23_hi, L22_hi ~ ~L23_hi & L24_hi
        -- XOR this round's constant (both halves) into lane 1
  2479. L01_lo = L01_lo ~ RC_lo[round_idx]
  2480. L01_hi = L01_hi ~ RC_hi[round_idx]
  2481. end
        -- store the permuted state back into the caller's tables
  2482. lanes_lo[1] = L01_lo; lanes_hi[1] = L01_hi
  2483. lanes_lo[2] = L02_lo; lanes_hi[2] = L02_hi
  2484. lanes_lo[3] = L03_lo; lanes_hi[3] = L03_hi
  2485. lanes_lo[4] = L04_lo; lanes_hi[4] = L04_hi
  2486. lanes_lo[5] = L05_lo; lanes_hi[5] = L05_hi
  2487. lanes_lo[6] = L06_lo; lanes_hi[6] = L06_hi
  2488. lanes_lo[7] = L07_lo; lanes_hi[7] = L07_hi
  2489. lanes_lo[8] = L08_lo; lanes_hi[8] = L08_hi
  2490. lanes_lo[9] = L09_lo; lanes_hi[9] = L09_hi
  2491. lanes_lo[10] = L10_lo; lanes_hi[10] = L10_hi
  2492. lanes_lo[11] = L11_lo; lanes_hi[11] = L11_hi
  2493. lanes_lo[12] = L12_lo; lanes_hi[12] = L12_hi
  2494. lanes_lo[13] = L13_lo; lanes_hi[13] = L13_hi
  2495. lanes_lo[14] = L14_lo; lanes_hi[14] = L14_hi
  2496. lanes_lo[15] = L15_lo; lanes_hi[15] = L15_hi
  2497. lanes_lo[16] = L16_lo; lanes_hi[16] = L16_hi
  2498. lanes_lo[17] = L17_lo; lanes_hi[17] = L17_hi
  2499. lanes_lo[18] = L18_lo; lanes_hi[18] = L18_hi
  2500. lanes_lo[19] = L19_lo; lanes_hi[19] = L19_hi
  2501. lanes_lo[20] = L20_lo; lanes_hi[20] = L20_hi
  2502. lanes_lo[21] = L21_lo; lanes_hi[21] = L21_hi
  2503. lanes_lo[22] = L22_lo; lanes_hi[22] = L22_hi
  2504. lanes_lo[23] = L23_lo; lanes_hi[23] = L23_hi
  2505. lanes_lo[24] = L24_lo; lanes_hi[24] = L24_hi
  2506. lanes_lo[25] = L25_lo; lanes_hi[25] = L25_hi
  2507. end
  2508. end
  -- blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
  -- Updates the 8-word state H in place by processing `size` bytes in 64-byte steps.
  --   H                : H[1..8] are read at entry and written back after the last block
  --   str, offs        : if str is non-nil, the 16 message words of every block are unpacked from str
  --                      (little-endian, 4 bytes each, first block at 1-based position offs + 1) into common_W;
  --                      if str is nil, whatever common_W currently holds is used as the message words
  --   bytes_compressed : byte counter; grows by (last_block_size or 64) for every block and is XORed into vC/vD
  --   last_block_size  : when given, it replaces 64 as the counter increment and vE is inverted (flag f0)
  --   is_last_node     : when truthy, vF is inverted (flag f1)
  -- Returns the updated bytes_compressed.
  -- NOTE(review): shift pairs such as "x >> 16 | x << 16" act as rotations only if integers are 32 bits wide;
  -- presumably this chunk is compiled only on such builds (see the int32 conversion of t0 below) - confirm in the loader.
  2509. local function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
  2510. -- offs >= 0, size >= 0, size is multiple of 64
  2511. local W = common_W
  2512. local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  2513. for pos = offs + 1, offs + size, 64 do
  2514. if str then
  2515. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2516. string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
  2517. end
  -- working vector v0..vF: first half is the current chaining value, second half is seeded from sha2_H_hi[1..8]
  2518. local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  2519. local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
  2520. bytes_compressed = bytes_compressed + (last_block_size or 64)
  2521. local t0 = bytes_compressed % 2^32
  2522. local t1 = (bytes_compressed - t0) / 2^32
  2523. t0 = (t0 + 2^31) % 2^32 - 2^31 -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while XORing
  2524. vC = vC ~ t0 -- t0 = low_4_bytes(bytes_compressed)
  2525. vD = vD ~ t1 -- t1 = high_4_bytes(bytes_compressed)
  2526. if last_block_size then -- flag f0
  2527. vE = ~vE
  2528. end
  2529. if is_last_node then -- flag f1
  2530. vF = ~vF
  2531. end
  -- 10 rounds; sigma[j] lists the indices (into W) of the 16 message words in the order round j consumes them
  2532. for j = 1, 10 do
  2533. local row = sigma[j]
  -- first four mixing steps act on (v0,v4,v8,vC), (v1,v5,v9,vD), (v2,v6,vA,vE), (v3,v7,vB,vF);
  -- each step: add + XOR + shift pair by 16, add + XOR + shift pair by 12, then the same with 8 and 7
  2534. v0 = v0 + v4 + W[row[1]]
  2535. vC = vC ~ v0
  2536. vC = vC >> 16 | vC << 16
  2537. v8 = v8 + vC
  2538. v4 = v4 ~ v8
  2539. v4 = v4 >> 12 | v4 << 20
  2540. v0 = v0 + v4 + W[row[2]]
  2541. vC = vC ~ v0
  2542. vC = vC >> 8 | vC << 24
  2543. v8 = v8 + vC
  2544. v4 = v4 ~ v8
  2545. v4 = v4 >> 7 | v4 << 25
  2546. v1 = v1 + v5 + W[row[3]]
  2547. vD = vD ~ v1
  2548. vD = vD >> 16 | vD << 16
  2549. v9 = v9 + vD
  2550. v5 = v5 ~ v9
  2551. v5 = v5 >> 12 | v5 << 20
  2552. v1 = v1 + v5 + W[row[4]]
  2553. vD = vD ~ v1
  2554. vD = vD >> 8 | vD << 24
  2555. v9 = v9 + vD
  2556. v5 = v5 ~ v9
  2557. v5 = v5 >> 7 | v5 << 25
  2558. v2 = v2 + v6 + W[row[5]]
  2559. vE = vE ~ v2
  2560. vE = vE >> 16 | vE << 16
  2561. vA = vA + vE
  2562. v6 = v6 ~ vA
  2563. v6 = v6 >> 12 | v6 << 20
  2564. v2 = v2 + v6 + W[row[6]]
  2565. vE = vE ~ v2
  2566. vE = vE >> 8 | vE << 24
  2567. vA = vA + vE
  2568. v6 = v6 ~ vA
  2569. v6 = v6 >> 7 | v6 << 25
  2570. v3 = v3 + v7 + W[row[7]]
  2571. vF = vF ~ v3
  2572. vF = vF >> 16 | vF << 16
  2573. vB = vB + vF
  2574. v7 = v7 ~ vB
  2575. v7 = v7 >> 12 | v7 << 20
  2576. v3 = v3 + v7 + W[row[8]]
  2577. vF = vF ~ v3
  2578. vF = vF >> 8 | vF << 24
  2579. vB = vB + vF
  2580. v7 = v7 ~ vB
  2581. v7 = v7 >> 7 | v7 << 25
  -- last four mixing steps act on (v0,v5,vA,vF), (v1,v6,vB,vC), (v2,v7,v8,vD), (v3,v4,v9,vE)
  2582. v0 = v0 + v5 + W[row[9]]
  2583. vF = vF ~ v0
  2584. vF = vF >> 16 | vF << 16
  2585. vA = vA + vF
  2586. v5 = v5 ~ vA
  2587. v5 = v5 >> 12 | v5 << 20
  2588. v0 = v0 + v5 + W[row[10]]
  2589. vF = vF ~ v0
  2590. vF = vF >> 8 | vF << 24
  2591. vA = vA + vF
  2592. v5 = v5 ~ vA
  2593. v5 = v5 >> 7 | v5 << 25
  2594. v1 = v1 + v6 + W[row[11]]
  2595. vC = vC ~ v1
  2596. vC = vC >> 16 | vC << 16
  2597. vB = vB + vC
  2598. v6 = v6 ~ vB
  2599. v6 = v6 >> 12 | v6 << 20
  2600. v1 = v1 + v6 + W[row[12]]
  2601. vC = vC ~ v1
  2602. vC = vC >> 8 | vC << 24
  2603. vB = vB + vC
  2604. v6 = v6 ~ vB
  2605. v6 = v6 >> 7 | v6 << 25
  2606. v2 = v2 + v7 + W[row[13]]
  2607. vD = vD ~ v2
  2608. vD = vD >> 16 | vD << 16
  2609. v8 = v8 + vD
  2610. v7 = v7 ~ v8
  2611. v7 = v7 >> 12 | v7 << 20
  2612. v2 = v2 + v7 + W[row[14]]
  2613. vD = vD ~ v2
  2614. vD = vD >> 8 | vD << 24
  2615. v8 = v8 + vD
  2616. v7 = v7 ~ v8
  2617. v7 = v7 >> 7 | v7 << 25
  2618. v3 = v3 + v4 + W[row[15]]
  2619. vE = vE ~ v3
  2620. vE = vE >> 16 | vE << 16
  2621. v9 = v9 + vE
  2622. v4 = v4 ~ v9
  2623. v4 = v4 >> 12 | v4 << 20
  2624. v3 = v3 + v4 + W[row[16]]
  2625. vE = vE ~ v3
  2626. vE = vE >> 8 | vE << 24
  2627. v9 = v9 + vE
  2628. v4 = v4 ~ v9
  2629. v4 = v4 >> 7 | v4 << 25
  2630. end
  -- new chaining value = old chaining value XOR both halves of the working vector
  2631. h1 = h1 ~ v0 ~ v8
  2632. h2 = h2 ~ v1 ~ v9
  2633. h3 = h3 ~ v2 ~ vA
  2634. h4 = h4 ~ v3 ~ vB
  2635. h5 = h5 ~ v4 ~ vC
  2636. h6 = h6 ~ v5 ~ vD
  2637. h7 = h7 ~ v6 ~ vE
  2638. h8 = h8 ~ v7 ~ vF
  2639. end
  2640. H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  2641. return bytes_compressed
  2642. end
  -- blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
  -- Updates the state (H_lo[1..8], H_hi[1..8]) in place by processing `size` bytes in 128-byte steps.
  -- Every 64-bit quantity is carried as a pair of variables (*_lo, *_hi).
  --   str, offs        : if str is non-nil, each block is unpacked from str (little-endian 4-byte integers, first block
  --                      at 1-based position offs + 1) into common_W[1..32]; if str is nil, the current content of
  --                      common_W is used. Message word i has its low half in W[2*i-1] and its high half in W[2*i].
  --   bytes_compressed : byte counter; grows by (last_block_size or 128) per block and is XORed into vC_lo/vC_hi
  --   last_block_size  : when given, it replaces 128 as the counter increment and vE is inverted (flag f0)
  --   is_last_node     : when truthy, vF is inverted (flag f1)
  -- Returns the updated bytes_compressed.
  -- Idioms used in the rounds:
  --   * 64-bit addition: the low halves are reduced with "% 2^32" and summed; floor(sum / 2^32) is the carry added
  --     to the high half; "0|((sum + 2^31) % 2^32 - 2^31)" turns the wrapped low sum back into an integer
  --   * "x_lo, x_hi = x_hi ~ y_hi, x_lo ~ y_lo" XORs and swaps the halves in a single assignment
  --   * shift pairs (>> 24 | << 8), (>> 16 | << 16), (<< 1 | >> 31) combine bits of both halves
  -- NOTE(review): these shift pairs are 64-bit rotations of the lo/hi pair only if integers are 32 bits wide;
  -- presumably this chunk is compiled only on such builds - confirm in the loader.
  2643. local function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
  2644. -- offs >= 0, size >= 0, size is multiple of 128
  2645. local W = common_W
  2646. local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  2647. local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  2648. for pos = offs + 1, offs + size, 128 do
  2649. if str then
  2650. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16],
  2651. W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] =
  2652. string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
  2653. end
  -- working vector v0..vF: first half is the current chaining value, second half is seeded from sha2_H_lo/sha2_H_hi[1..8]
  2654. local v0_lo, v1_lo, v2_lo, v3_lo, v4_lo, v5_lo, v6_lo, v7_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2655. local v0_hi, v1_hi, v2_hi, v3_hi, v4_hi, v5_hi, v6_hi, v7_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2656. local v8_lo, v9_lo, vA_lo, vB_lo, vC_lo, vD_lo, vE_lo, vF_lo = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  2657. local v8_hi, v9_hi, vA_hi, vB_hi, vC_hi, vD_hi, vE_hi, vF_hi = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
  2658. bytes_compressed = bytes_compressed + (last_block_size or 128)
  2659. local t0_lo = bytes_compressed % 2^32
  2660. local t0_hi = (bytes_compressed - t0_lo) / 2^32
  2661. t0_lo = (t0_lo + 2^31) % 2^32 - 2^31 -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while XORing
  2662. vC_lo = vC_lo ~ t0_lo -- t0 = low_8_bytes(bytes_compressed)
  2663. vC_hi = vC_hi ~ t0_hi
  2664. -- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
  2665. if last_block_size then -- flag f0
  2666. vE_lo = ~vE_lo
  2667. vE_hi = ~vE_hi
  2668. end
  2669. if is_last_node then -- flag f1
  2670. vF_lo = ~vF_lo
  2671. vF_hi = ~vF_hi
  2672. end
  -- 12 rounds; sigma[j] lists the message-word indices for round j, k = 2 * index selects the W pair (W[k-1], W[k])
  2673. for j = 1, 12 do
  2674. local row = sigma[j]
  -- first four mixing steps act on (v0,v4,v8,vC), (v1,v5,v9,vD), (v2,v6,vA,vE), (v3,v7,vB,vF)
  2675. local k = row[1] * 2
  2676. v0_lo = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  2677. v0_hi = v0_hi + v4_hi + floor(v0_lo / 2^32) + W[k]
  2678. v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2679. vC_lo, vC_hi = vC_hi ~ v0_hi, vC_lo ~ v0_lo
  2680. v8_lo = v8_lo % 2^32 + vC_lo % 2^32
  2681. v8_hi = v8_hi + vC_hi + floor(v8_lo / 2^32)
  2682. v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2683. v4_lo, v4_hi = v4_lo ~ v8_lo, v4_hi ~ v8_hi
  2684. v4_lo, v4_hi = v4_lo >> 24 | v4_hi << 8, v4_hi >> 24 | v4_lo << 8
  2685. k = row[2] * 2
  2686. v0_lo = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  2687. v0_hi = v0_hi + v4_hi + floor(v0_lo / 2^32) + W[k]
  2688. v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2689. vC_lo, vC_hi = vC_lo ~ v0_lo, vC_hi ~ v0_hi
  2690. vC_lo, vC_hi = vC_lo >> 16 | vC_hi << 16, vC_hi >> 16 | vC_lo << 16
  2691. v8_lo = v8_lo % 2^32 + vC_lo % 2^32
  2692. v8_hi = v8_hi + vC_hi + floor(v8_lo / 2^32)
  2693. v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2694. v4_lo, v4_hi = v4_lo ~ v8_lo, v4_hi ~ v8_hi
  2695. v4_lo, v4_hi = v4_lo << 1 | v4_hi >> 31, v4_hi << 1 | v4_lo >> 31
  2696. k = row[3] * 2
  2697. v1_lo = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2698. v1_hi = v1_hi + v5_hi + floor(v1_lo / 2^32) + W[k]
  2699. v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2700. vD_lo, vD_hi = vD_hi ~ v1_hi, vD_lo ~ v1_lo
  2701. v9_lo = v9_lo % 2^32 + vD_lo % 2^32
  2702. v9_hi = v9_hi + vD_hi + floor(v9_lo / 2^32)
  2703. v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  2704. v5_lo, v5_hi = v5_lo ~ v9_lo, v5_hi ~ v9_hi
  2705. v5_lo, v5_hi = v5_lo >> 24 | v5_hi << 8, v5_hi >> 24 | v5_lo << 8
  2706. k = row[4] * 2
  2707. v1_lo = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2708. v1_hi = v1_hi + v5_hi + floor(v1_lo / 2^32) + W[k]
  2709. v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2710. vD_lo, vD_hi = vD_lo ~ v1_lo, vD_hi ~ v1_hi
  2711. vD_lo, vD_hi = vD_lo >> 16 | vD_hi << 16, vD_hi >> 16 | vD_lo << 16
  2712. v9_lo = v9_lo % 2^32 + vD_lo % 2^32
  2713. v9_hi = v9_hi + vD_hi + floor(v9_lo / 2^32)
  2714. v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  2715. v5_lo, v5_hi = v5_lo ~ v9_lo, v5_hi ~ v9_hi
  2716. v5_lo, v5_hi = v5_lo << 1 | v5_hi >> 31, v5_hi << 1 | v5_lo >> 31
  2717. k = row[5] * 2
  2718. v2_lo = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2719. v2_hi = v2_hi + v6_hi + floor(v2_lo / 2^32) + W[k]
  2720. v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2721. vE_lo, vE_hi = vE_hi ~ v2_hi, vE_lo ~ v2_lo
  2722. vA_lo = vA_lo % 2^32 + vE_lo % 2^32
  2723. vA_hi = vA_hi + vE_hi + floor(vA_lo / 2^32)
  2724. vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2725. v6_lo, v6_hi = v6_lo ~ vA_lo, v6_hi ~ vA_hi
  2726. v6_lo, v6_hi = v6_lo >> 24 | v6_hi << 8, v6_hi >> 24 | v6_lo << 8
  2727. k = row[6] * 2
  2728. v2_lo = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2729. v2_hi = v2_hi + v6_hi + floor(v2_lo / 2^32) + W[k]
  2730. v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2731. vE_lo, vE_hi = vE_lo ~ v2_lo, vE_hi ~ v2_hi
  2732. vE_lo, vE_hi = vE_lo >> 16 | vE_hi << 16, vE_hi >> 16 | vE_lo << 16
  2733. vA_lo = vA_lo % 2^32 + vE_lo % 2^32
  2734. vA_hi = vA_hi + vE_hi + floor(vA_lo / 2^32)
  2735. vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2736. v6_lo, v6_hi = v6_lo ~ vA_lo, v6_hi ~ vA_hi
  2737. v6_lo, v6_hi = v6_lo << 1 | v6_hi >> 31, v6_hi << 1 | v6_lo >> 31
  2738. k = row[7] * 2
  2739. v3_lo = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2740. v3_hi = v3_hi + v7_hi + floor(v3_lo / 2^32) + W[k]
  2741. v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  2742. vF_lo, vF_hi = vF_hi ~ v3_hi, vF_lo ~ v3_lo
  2743. vB_lo = vB_lo % 2^32 + vF_lo % 2^32
  2744. vB_hi = vB_hi + vF_hi + floor(vB_lo / 2^32)
  2745. vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2746. v7_lo, v7_hi = v7_lo ~ vB_lo, v7_hi ~ vB_hi
  2747. v7_lo, v7_hi = v7_lo >> 24 | v7_hi << 8, v7_hi >> 24 | v7_lo << 8
  2748. k = row[8] * 2
  2749. v3_lo = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2750. v3_hi = v3_hi + v7_hi + floor(v3_lo / 2^32) + W[k]
  2751. v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  2752. vF_lo, vF_hi = vF_lo ~ v3_lo, vF_hi ~ v3_hi
  2753. vF_lo, vF_hi = vF_lo >> 16 | vF_hi << 16, vF_hi >> 16 | vF_lo << 16
  2754. vB_lo = vB_lo % 2^32 + vF_lo % 2^32
  2755. vB_hi = vB_hi + vF_hi + floor(vB_lo / 2^32)
  2756. vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2757. v7_lo, v7_hi = v7_lo ~ vB_lo, v7_hi ~ vB_hi
  2758. v7_lo, v7_hi = v7_lo << 1 | v7_hi >> 31, v7_hi << 1 | v7_lo >> 31
  -- last four mixing steps act on (v0,v5,vA,vF), (v1,v6,vB,vC), (v2,v7,v8,vD), (v3,v4,v9,vE)
  2759. k = row[9] * 2
  2760. v0_lo = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2761. v0_hi = v0_hi + v5_hi + floor(v0_lo / 2^32) + W[k]
  2762. v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2763. vF_lo, vF_hi = vF_hi ~ v0_hi, vF_lo ~ v0_lo
  2764. vA_lo = vA_lo % 2^32 + vF_lo % 2^32
  2765. vA_hi = vA_hi + vF_hi + floor(vA_lo / 2^32)
  2766. vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2767. v5_lo, v5_hi = v5_lo ~ vA_lo, v5_hi ~ vA_hi
  2768. v5_lo, v5_hi = v5_lo >> 24 | v5_hi << 8, v5_hi >> 24 | v5_lo << 8
  2769. k = row[10] * 2
  2770. v0_lo = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2771. v0_hi = v0_hi + v5_hi + floor(v0_lo / 2^32) + W[k]
  2772. v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2773. vF_lo, vF_hi = vF_lo ~ v0_lo, vF_hi ~ v0_hi
  2774. vF_lo, vF_hi = vF_lo >> 16 | vF_hi << 16, vF_hi >> 16 | vF_lo << 16
  2775. vA_lo = vA_lo % 2^32 + vF_lo % 2^32
  2776. vA_hi = vA_hi + vF_hi + floor(vA_lo / 2^32)
  2777. vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2778. v5_lo, v5_hi = v5_lo ~ vA_lo, v5_hi ~ vA_hi
  2779. v5_lo, v5_hi = v5_lo << 1 | v5_hi >> 31, v5_hi << 1 | v5_lo >> 31
  2780. k = row[11] * 2
  2781. v1_lo = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2782. v1_hi = v1_hi + v6_hi + floor(v1_lo / 2^32) + W[k]
  2783. v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2784. vC_lo, vC_hi = vC_hi ~ v1_hi, vC_lo ~ v1_lo
  2785. vB_lo = vB_lo % 2^32 + vC_lo % 2^32
  2786. vB_hi = vB_hi + vC_hi + floor(vB_lo / 2^32)
  2787. vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2788. v6_lo, v6_hi = v6_lo ~ vB_lo, v6_hi ~ vB_hi
  2789. v6_lo, v6_hi = v6_lo >> 24 | v6_hi << 8, v6_hi >> 24 | v6_lo << 8
  2790. k = row[12] * 2
  2791. v1_lo = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2792. v1_hi = v1_hi + v6_hi + floor(v1_lo / 2^32) + W[k]
  2793. v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2794. vC_lo, vC_hi = vC_lo ~ v1_lo, vC_hi ~ v1_hi
  2795. vC_lo, vC_hi = vC_lo >> 16 | vC_hi << 16, vC_hi >> 16 | vC_lo << 16
  2796. vB_lo = vB_lo % 2^32 + vC_lo % 2^32
  2797. vB_hi = vB_hi + vC_hi + floor(vB_lo / 2^32)
  2798. vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2799. v6_lo, v6_hi = v6_lo ~ vB_lo, v6_hi ~ vB_hi
  2800. v6_lo, v6_hi = v6_lo << 1 | v6_hi >> 31, v6_hi << 1 | v6_lo >> 31
  2801. k = row[13] * 2
  2802. v2_lo = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2803. v2_hi = v2_hi + v7_hi + floor(v2_lo / 2^32) + W[k]
  2804. v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2805. vD_lo, vD_hi = vD_hi ~ v2_hi, vD_lo ~ v2_lo
  2806. v8_lo = v8_lo % 2^32 + vD_lo % 2^32
  2807. v8_hi = v8_hi + vD_hi + floor(v8_lo / 2^32)
  2808. v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2809. v7_lo, v7_hi = v7_lo ~ v8_lo, v7_hi ~ v8_hi
  2810. v7_lo, v7_hi = v7_lo >> 24 | v7_hi << 8, v7_hi >> 24 | v7_lo << 8
  2811. k = row[14] * 2
  2812. v2_lo = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2813. v2_hi = v2_hi + v7_hi + floor(v2_lo / 2^32) + W[k]
  2814. v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2815. vD_lo, vD_hi = vD_lo ~ v2_lo, vD_hi ~ v2_hi
  2816. vD_lo, vD_hi = vD_lo >> 16 | vD_hi << 16, vD_hi >> 16 | vD_lo << 16
  2817. v8_lo = v8_lo % 2^32 + vD_lo % 2^32
  2818. v8_hi = v8_hi + vD_hi + floor(v8_lo / 2^32)
  2819. v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2820. v7_lo, v7_hi = v7_lo ~ v8_lo, v7_hi ~ v8_hi
  2821. v7_lo, v7_hi = v7_lo << 1 | v7_hi >> 31, v7_hi << 1 | v7_lo >> 31
  2822. k = row[15] * 2
  2823. v3_lo = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  2824. v3_hi = v3_hi + v4_hi + floor(v3_lo / 2^32) + W[k]
  2825. v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  2826. vE_lo, vE_hi = vE_hi ~ v3_hi, vE_lo ~ v3_lo
  2827. v9_lo = v9_lo % 2^32 + vE_lo % 2^32
  2828. v9_hi = v9_hi + vE_hi + floor(v9_lo / 2^32)
  2829. v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  2830. v4_lo, v4_hi = v4_lo ~ v9_lo, v4_hi ~ v9_hi
  2831. v4_lo, v4_hi = v4_lo >> 24 | v4_hi << 8, v4_hi >> 24 | v4_lo << 8
  2832. k = row[16] * 2
  2833. v3_lo = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  2834. v3_hi = v3_hi + v4_hi + floor(v3_lo / 2^32) + W[k]
  2835. v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  2836. vE_lo, vE_hi = vE_lo ~ v3_lo, vE_hi ~ v3_hi
  2837. vE_lo, vE_hi = vE_lo >> 16 | vE_hi << 16, vE_hi >> 16 | vE_lo << 16
  2838. v9_lo = v9_lo % 2^32 + vE_lo % 2^32
  2839. v9_hi = v9_hi + vE_hi + floor(v9_lo / 2^32)
  2840. v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  2841. v4_lo, v4_hi = v4_lo ~ v9_lo, v4_hi ~ v9_hi
  2842. v4_lo, v4_hi = v4_lo << 1 | v4_hi >> 31, v4_hi << 1 | v4_lo >> 31
  2843. end
  -- new chaining value = old chaining value XOR both halves of the working vector (low and high halves separately)
  2844. h1_lo = h1_lo ~ v0_lo ~ v8_lo
  2845. h2_lo = h2_lo ~ v1_lo ~ v9_lo
  2846. h3_lo = h3_lo ~ v2_lo ~ vA_lo
  2847. h4_lo = h4_lo ~ v3_lo ~ vB_lo
  2848. h5_lo = h5_lo ~ v4_lo ~ vC_lo
  2849. h6_lo = h6_lo ~ v5_lo ~ vD_lo
  2850. h7_lo = h7_lo ~ v6_lo ~ vE_lo
  2851. h8_lo = h8_lo ~ v7_lo ~ vF_lo
  2852. h1_hi = h1_hi ~ v0_hi ~ v8_hi
  2853. h2_hi = h2_hi ~ v1_hi ~ v9_hi
  2854. h3_hi = h3_hi ~ v2_hi ~ vA_hi
  2855. h4_hi = h4_hi ~ v3_hi ~ vB_hi
  2856. h5_hi = h5_hi ~ v4_hi ~ vC_hi
  2857. h6_hi = h6_hi ~ v5_hi ~ vD_hi
  2858. h7_hi = h7_hi ~ v6_hi ~ vE_hi
  2859. h8_hi = h8_hi ~ v7_hi ~ vF_hi
  2860. end
  2861. H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2862. H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2863. return bytes_compressed
  2864. end
  -- blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
  -- Processes `size` bytes in 64-byte steps, starting from the 8-word value H_in[1..8].
  --   str, offs    : if str is non-nil, the 16 message words of each block are unpacked from str (little-endian,
  --                  4 bytes each, first block at 1-based position offs + 1) into common_W;
  --                  if str is nil, whatever common_W currently holds is used as the message words
  --   flags        : initial value of vF, used unchanged for every block handled by this call
  --   chunk_index  : split into low/high 4-byte parts which become the initial vC/vD, same for every block of this call
  --   H_in         : input value, read once at entry
  --   H_out        : table receiving the result in H_out[1..8]; defaults to H_in (in-place update)
  --   wide_output  : when truthy, H_out[9..16] are also written for each block (see below)
  --   block_length : initial value of vE; defaults to 64
  -- Returns nothing.
  -- NOTE(review): shift pairs such as "x >> 16 | x << 16" act as rotations only if integers are 32 bits wide;
  -- presumably this chunk is compiled only on such builds (see the int32 conversion of t0 below) - confirm in the loader.
  2865. local function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
  2866. -- offs >= 0, size >= 0, size is multiple of 64
  2867. block_length = block_length or 64
  2868. local W = common_W
  2869. local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
  2870. H_out = H_out or H_in
  2871. for pos = offs + 1, offs + size, 64 do
  2872. if str then
  2873. W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2874. string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)
  2875. end
  -- working vector: v0..v7 = current value, v8..vB = sha2_H_hi[1..4], vC..vF = counter low/high, block length, flags
  2876. local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  2877. local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
  2878. local t0 = chunk_index % 2^32 -- t0 = low_4_bytes(chunk_index)
  2879. local t1 = (chunk_index - t0) / 2^32 -- t1 = high_4_bytes(chunk_index)
  2880. t0 = (t0 + 2^31) % 2^32 - 2^31 -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while ORing
  2881. local vC, vD, vE, vF = 0|t0, 0|t1, block_length, flags
  -- 7 rounds; the message word for each mixing step is W[perm_blake3[j + constant]], so a single table
  -- indexed with a per-step offset supplies the word order of every round
  2882. for j = 1, 7 do
  -- first four mixing steps act on (v0,v4,v8,vC), (v1,v5,v9,vD), (v2,v6,vA,vE), (v3,v7,vB,vF)
  2883. v0 = v0 + v4 + W[perm_blake3[j]]
  2884. vC = vC ~ v0
  2885. vC = vC >> 16 | vC << 16
  2886. v8 = v8 + vC
  2887. v4 = v4 ~ v8
  2888. v4 = v4 >> 12 | v4 << 20
  2889. v0 = v0 + v4 + W[perm_blake3[j + 14]]
  2890. vC = vC ~ v0
  2891. vC = vC >> 8 | vC << 24
  2892. v8 = v8 + vC
  2893. v4 = v4 ~ v8
  2894. v4 = v4 >> 7 | v4 << 25
  2895. v1 = v1 + v5 + W[perm_blake3[j + 1]]
  2896. vD = vD ~ v1
  2897. vD = vD >> 16 | vD << 16
  2898. v9 = v9 + vD
  2899. v5 = v5 ~ v9
  2900. v5 = v5 >> 12 | v5 << 20
  2901. v1 = v1 + v5 + W[perm_blake3[j + 2]]
  2902. vD = vD ~ v1
  2903. vD = vD >> 8 | vD << 24
  2904. v9 = v9 + vD
  2905. v5 = v5 ~ v9
  2906. v5 = v5 >> 7 | v5 << 25
  2907. v2 = v2 + v6 + W[perm_blake3[j + 16]]
  2908. vE = vE ~ v2
  2909. vE = vE >> 16 | vE << 16
  2910. vA = vA + vE
  2911. v6 = v6 ~ vA
  2912. v6 = v6 >> 12 | v6 << 20
  2913. v2 = v2 + v6 + W[perm_blake3[j + 7]]
  2914. vE = vE ~ v2
  2915. vE = vE >> 8 | vE << 24
  2916. vA = vA + vE
  2917. v6 = v6 ~ vA
  2918. v6 = v6 >> 7 | v6 << 25
  2919. v3 = v3 + v7 + W[perm_blake3[j + 15]]
  2920. vF = vF ~ v3
  2921. vF = vF >> 16 | vF << 16
  2922. vB = vB + vF
  2923. v7 = v7 ~ vB
  2924. v7 = v7 >> 12 | v7 << 20
  2925. v3 = v3 + v7 + W[perm_blake3[j + 17]]
  2926. vF = vF ~ v3
  2927. vF = vF >> 8 | vF << 24
  2928. vB = vB + vF
  2929. v7 = v7 ~ vB
  2930. v7 = v7 >> 7 | v7 << 25
  -- last four mixing steps act on (v0,v5,vA,vF), (v1,v6,vB,vC), (v2,v7,v8,vD), (v3,v4,v9,vE)
  2931. v0 = v0 + v5 + W[perm_blake3[j + 21]]
  2932. vF = vF ~ v0
  2933. vF = vF >> 16 | vF << 16
  2934. vA = vA + vF
  2935. v5 = v5 ~ vA
  2936. v5 = v5 >> 12 | v5 << 20
  2937. v0 = v0 + v5 + W[perm_blake3[j + 5]]
  2938. vF = vF ~ v0
  2939. vF = vF >> 8 | vF << 24
  2940. vA = vA + vF
  2941. v5 = v5 ~ vA
  2942. v5 = v5 >> 7 | v5 << 25
  2943. v1 = v1 + v6 + W[perm_blake3[j + 3]]
  2944. vC = vC ~ v1
  2945. vC = vC >> 16 | vC << 16
  2946. vB = vB + vC
  2947. v6 = v6 ~ vB
  2948. v6 = v6 >> 12 | v6 << 20
  2949. v1 = v1 + v6 + W[perm_blake3[j + 6]]
  2950. vC = vC ~ v1
  2951. vC = vC >> 8 | vC << 24
  2952. vB = vB + vC
  2953. v6 = v6 ~ vB
  2954. v6 = v6 >> 7 | v6 << 25
  2955. v2 = v2 + v7 + W[perm_blake3[j + 4]]
  2956. vD = vD ~ v2
  2957. vD = vD >> 16 | vD << 16
  2958. v8 = v8 + vD
  2959. v7 = v7 ~ v8
  2960. v7 = v7 >> 12 | v7 << 20
  2961. v2 = v2 + v7 + W[perm_blake3[j + 18]]
  2962. vD = vD ~ v2
  2963. vD = vD >> 8 | vD << 24
  2964. v8 = v8 + vD
  2965. v7 = v7 ~ v8
  2966. v7 = v7 >> 7 | v7 << 25
  2967. v3 = v3 + v4 + W[perm_blake3[j + 19]]
  2968. vE = vE ~ v3
  2969. vE = vE >> 16 | vE << 16
  2970. v9 = v9 + vE
  2971. v4 = v4 ~ v9
  2972. v4 = v4 >> 12 | v4 << 20
  2973. v3 = v3 + v4 + W[perm_blake3[j + 20]]
  2974. vE = vE ~ v3
  2975. vE = vE >> 8 | vE << 24
  2976. v9 = v9 + vE
  2977. v4 = v4 ~ v9
  2978. v4 = v4 >> 7 | v4 << 25
  2979. end
  -- the extra 8 output words must be computed before h1..h8 are overwritten: they use the value that entered this block
  2980. if wide_output then
  2981. H_out[ 9] = h1 ~ v8
  2982. H_out[10] = h2 ~ v9
  2983. H_out[11] = h3 ~ vA
  2984. H_out[12] = h4 ~ vB
  2985. H_out[13] = h5 ~ vC
  2986. H_out[14] = h6 ~ vD
  2987. H_out[15] = h7 ~ vE
  2988. H_out[16] = h8 ~ vF
  2989. end
  -- value carried to the next block (and finally to H_out[1..8]): first half of the working vector XOR second half
  2990. h1 = v0 ~ v8
  2991. h2 = v1 ~ v9
  2992. h3 = v2 ~ vA
  2993. h4 = v3 ~ vB
  2994. h5 = v4 ~ vC
  2995. h6 = v5 ~ vD
  2996. h7 = v6 ~ vE
  2997. h8 = v7 ~ vF
  2998. end
  2999. H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
  3000. end
  3001. return XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
  3002. ]=](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3)
  3003. end
  -- Keep XOR if it has already been assigned; otherwise XORA5 is used in its place.
  3004. XOR = XOR or XORA5
  3005. if branch == "LIB32" or branch == "EMUL" then
  3006. -- implementation for Lua 5.1/5.2 (with or without bitwise library available)
  -- sha256_feed_64(H, str, offs, size)
  -- Updates the 8-word state H[1..8] in place by processing `size` bytes of str in 64-byte blocks.
  -- offs is 0-based: the first block covers bytes offs+1 .. offs+64 of str.
  -- This variant uses only floating-point style arithmetic plus the external AND/XOR helpers:
  --   * "x / 2^n" followed by "f % 1 * (2^32 - 1) + f" equals floor(x / 2^n) + frac * 2^32,
  --     i.e. a 32-bit right rotation by n
  --   * "(x - x % 2^n) / 2^n" is a right shift by n
  --   * "x * 2^n" split into "lo + (x*2^n - lo) / 2^32" is a 32-bit left rotation by n
  --   * "-1-x" is passed to AND where the bitwise complement of x is needed
  -- NOTE(review): `byte` is presumably string.byte - confirm where it is localized.
  3007. function sha256_feed_64(H, str, offs, size)
  3008. -- offs >= 0, size >= 0, size is multiple of 64
  3009. local W, K = common_W, sha2_K_hi
  3010. local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  3011. for pos = offs, offs + size - 1, 64 do
  -- read 16 big-endian 32-bit words; pos is advanced locally while reading
  -- NOTE(review): relies on assignments to the loop variable not affecting the loop's own iteration - confirm for the targeted Lua versions
  3012. for j = 1, 16 do
  3013. pos = pos + 4
  3014. local a, b, c, d = byte(str, pos - 3, pos)
  3015. W[j] = ((a * 256 + b) * 256 + c) * 256 + d
  3016. end
  -- extend to 64 words: rotations by 7 and 18 plus shift by 3 of W[j-15], rotations by 17 and 19 plus shift by 10 of W[j-2]
  3017. for j = 17, 64 do
  3018. local a, b = W[j-15], W[j-2]
  3019. local a7, a18, b17, b19 = a / 2^7, a / 2^18, b / 2^17, b / 2^19
  3020. W[j] = (XOR(a7 % 1 * (2^32 - 1) + a7, a18 % 1 * (2^32 - 1) + a18, (a - a % 2^3) / 2^3) + W[j-16] + W[j-7]
  3021. + XOR(b17 % 1 * (2^32 - 1) + b17, b19 % 1 * (2^32 - 1) + b19, (b - b % 2^10) / 2^10)) % 2^32
  3022. end
  3023. local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
  3024. for j = 1, 64 do
  -- e and a may exceed 2^32 after the additions of the previous round, so they are reduced before being rotated
  3025. e = e % 2^32
  3026. local e6, e11, e7 = e / 2^6, e / 2^11, e * 2^7
  3027. local e7_lo = e7 % 2^32
  3028. local z = AND(e, f) + AND(-1-e, g) + h + K[j] + W[j]
  3029. + XOR(e6 % 1 * (2^32 - 1) + e6, e11 % 1 * (2^32 - 1) + e11, e7_lo + (e7 - e7_lo) / 2^32)
  3030. h = g
  3031. g = f
  3032. f = e
  3033. e = z + d
  3034. d = c
  3035. c = b
  3036. b = a % 2^32
  3037. local b2, b13, b10 = b / 2^2, b / 2^13, b * 2^10
  3038. local b10_lo = b10 % 2^32
  3039. a = z + AND(d, c) + AND(b, XOR(d, c)) +
  3040. XOR(b2 % 1 * (2^32 - 1) + b2, b13 % 1 * (2^32 - 1) + b13, b10_lo + (b10 - b10_lo) / 2^32)
  3041. end
  -- add the round result to the previous state, modulo 2^32
  3042. h1, h2, h3, h4 = (a + h1) % 2^32, (b + h2) % 2^32, (c + h3) % 2^32, (d + h4) % 2^32
  3043. h5, h6, h7, h8 = (e + h5) % 2^32, (f + h6) % 2^32, (g + h7) % 2^32, (h + h8) % 2^32
  3044. end
  3045. H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  3046. end
  -- sha512_feed_128(H_lo, H_hi, str, offs, size)
  -- Updates the state (H_lo[1..8], H_hi[1..8]) in place by processing `size` bytes of str in 128-byte blocks.
  -- offs is 0-based: the first block covers bytes offs+1 .. offs+128 of str.
  -- Every 64-bit word is handled as two 32-bit halves (*_hi, *_lo) using plain arithmetic and the external AND/XOR helpers:
  --   * "x % 2^n" extracts the low n bits, "(x - x % 2^n) / 2^n" is a right shift by n
  --   * a 64-bit shift/rotation is assembled from the shifted part of one half plus the low bits of the other half
  --     moved to the top (for example "(a_lo - a_lo_1) / 2^1 + a_hi_1 * 2^31")
  --   * low halves are summed first; "(sum - sum % 2^32) / 2^32" is the carry added to the high half
  --   * "-1-x" is passed to AND where the bitwise complement of x is needed
  -- NOTE(review): `byte` is presumably string.byte - confirm where it is localized.
  3047. function sha512_feed_128(H_lo, H_hi, str, offs, size)
  3048. -- offs >= 0, size >= 0, size is multiple of 128
  3049. -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  3050. local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
  3051. local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  3052. local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  3053. for pos = offs, offs + size - 1, 128 do
  -- read 32 big-endian 32-bit values (16 words, high half first); pos is advanced locally while reading
  -- NOTE(review): relies on assignments to the loop variable not affecting the loop's own iteration - confirm for the targeted Lua versions
  3054. for j = 1, 16*2 do
  3055. pos = pos + 4
  3056. local a, b, c, d = byte(str, pos - 3, pos)
  3057. W[j] = ((a * 256 + b) * 256 + c) * 256 + d
  3058. end
  -- extend to 80 words; jj = 2*k is the index of the low half of word k, so W[jj-31]/W[jj-30] is word k-15,
  -- W[jj-5]/W[jj-4] is word k-2, W[jj-15]/W[jj-14] is word k-7 and W[jj-33]/W[jj-32] is word k-16
  3059. for jj = 17*2, 80*2, 2 do
  3060. local a_hi, a_lo, b_hi, b_lo = W[jj-31], W[jj-30], W[jj-5], W[jj-4]
  3061. local b_hi_6, b_hi_19, b_hi_29, b_lo_19, b_lo_29, a_hi_1, a_hi_7, a_hi_8, a_lo_1, a_lo_8 =
  3062. b_hi % 2^6, b_hi % 2^19, b_hi % 2^29, b_lo % 2^19, b_lo % 2^29, a_hi % 2^1, a_hi % 2^7, a_hi % 2^8, a_lo % 2^1, a_lo % 2^8
  -- tmp1 = sum of the low halves (may exceed 2^32); tmp2 = its low 32 bits; the excess is carried into W[jj-1]
  3063. local tmp1 = XOR((a_lo - a_lo_1) / 2^1 + a_hi_1 * 2^31, (a_lo - a_lo_8) / 2^8 + a_hi_8 * 2^24, (a_lo - a_lo % 2^7) / 2^7 + a_hi_7 * 2^25) % 2^32
  3064. + XOR((b_lo - b_lo_19) / 2^19 + b_hi_19 * 2^13, b_lo_29 * 2^3 + (b_hi - b_hi_29) / 2^29, (b_lo - b_lo % 2^6) / 2^6 + b_hi_6 * 2^26) % 2^32
  3065. + W[jj-14] + W[jj-32]
  3066. local tmp2 = tmp1 % 2^32
  3067. W[jj-1] = (XOR((a_hi - a_hi_1) / 2^1 + a_lo_1 * 2^31, (a_hi - a_hi_8) / 2^8 + a_lo_8 * 2^24, (a_hi - a_hi_7) / 2^7)
  3068. + XOR((b_hi - b_hi_19) / 2^19 + b_lo_19 * 2^13, b_hi_29 * 2^3 + (b_lo - b_lo_29) / 2^29, (b_hi - b_hi_6) / 2^6)
  3069. + W[jj-15] + W[jj-33] + (tmp1 - tmp2) / 2^32) % 2^32
  3070. W[jj] = tmp2
  3071. end
  3072. local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3073. local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  -- 80 rounds; z_lo/z_hi is the per-round temporary that is added both to d (giving the new e) and to the new a
  3074. for j = 1, 80 do
  3075. local jj = 2*j
  3076. local e_lo_9, e_lo_14, e_lo_18, e_hi_9, e_hi_14, e_hi_18 = e_lo % 2^9, e_lo % 2^14, e_lo % 2^18, e_hi % 2^9, e_hi % 2^14, e_hi % 2^18
  3077. local tmp1 = (AND(e_lo, f_lo) + AND(-1-e_lo, g_lo)) % 2^32 + h_lo + K_lo[j] + W[jj]
  3078. + XOR((e_lo - e_lo_14) / 2^14 + e_hi_14 * 2^18, (e_lo - e_lo_18) / 2^18 + e_hi_18 * 2^14, e_lo_9 * 2^23 + (e_hi - e_hi_9) / 2^9) % 2^32
  3079. local z_lo = tmp1 % 2^32
  -- z_hi is intentionally left unreduced here; it is reduced with "% 2^32" when folded into e_hi and a_hi below
  3080. local z_hi = AND(e_hi, f_hi) + AND(-1-e_hi, g_hi) + h_hi + K_hi[j] + W[jj-1] + (tmp1 - z_lo) / 2^32
  3081. + XOR((e_hi - e_hi_14) / 2^14 + e_lo_14 * 2^18, (e_hi - e_hi_18) / 2^18 + e_lo_18 * 2^14, e_hi_9 * 2^23 + (e_lo - e_lo_9) / 2^9)
  3082. h_lo = g_lo; h_hi = g_hi
  3083. g_lo = f_lo; g_hi = f_hi
  3084. f_lo = e_lo; f_hi = e_hi
  3085. tmp1 = z_lo + d_lo
  3086. e_lo = tmp1 % 2^32
  3087. e_hi = (z_hi + d_hi + (tmp1 - e_lo) / 2^32) % 2^32
  3088. d_lo = c_lo; d_hi = c_hi
  3089. c_lo = b_lo; c_hi = b_hi
  3090. b_lo = a_lo; b_hi = a_hi
  3091. local b_lo_2, b_lo_7, b_lo_28, b_hi_2, b_hi_7, b_hi_28 = b_lo % 2^2, b_lo % 2^7, b_lo % 2^28, b_hi % 2^2, b_hi % 2^7, b_hi % 2^28
  3092. tmp1 = z_lo + (AND(d_lo, c_lo) + AND(b_lo, XOR(d_lo, c_lo))) % 2^32
  3093. + XOR((b_lo - b_lo_28) / 2^28 + b_hi_28 * 2^4, b_lo_2 * 2^30 + (b_hi - b_hi_2) / 2^2, b_lo_7 * 2^25 + (b_hi - b_hi_7) / 2^7) % 2^32
  3094. a_lo = tmp1 % 2^32
  3095. a_hi = (z_hi + AND(d_hi, c_hi) + AND(b_hi, XOR(d_hi, c_hi)) + (tmp1 - a_lo) / 2^32
  3096. + XOR((b_hi - b_hi_28) / 2^28 + b_lo_28 * 2^4, b_hi_2 * 2^30 + (b_lo - b_lo_2) / 2^2, b_hi_7 * 2^25 + (b_lo - b_lo_7) / 2^7)) % 2^32
  3097. end
  -- add the round result to the previous state as 64-bit sums; a_lo is reused as scratch for each low-half sum
  3098. a_lo = h1_lo + a_lo
  3099. h1_lo = a_lo % 2^32
  3100. h1_hi = (h1_hi + a_hi + (a_lo - h1_lo) / 2^32) % 2^32
  3101. a_lo = h2_lo + b_lo
  3102. h2_lo = a_lo % 2^32
  3103. h2_hi = (h2_hi + b_hi + (a_lo - h2_lo) / 2^32) % 2^32
  3104. a_lo = h3_lo + c_lo
  3105. h3_lo = a_lo % 2^32
  3106. h3_hi = (h3_hi + c_hi + (a_lo - h3_lo) / 2^32) % 2^32
  3107. a_lo = h4_lo + d_lo
  3108. h4_lo = a_lo % 2^32
  3109. h4_hi = (h4_hi + d_hi + (a_lo - h4_lo) / 2^32) % 2^32
  3110. a_lo = h5_lo + e_lo
  3111. h5_lo = a_lo % 2^32
  3112. h5_hi = (h5_hi + e_hi + (a_lo - h5_lo) / 2^32) % 2^32
  3113. a_lo = h6_lo + f_lo
  3114. h6_lo = a_lo % 2^32
  3115. h6_hi = (h6_hi + f_hi + (a_lo - h6_lo) / 2^32) % 2^32
  3116. a_lo = h7_lo + g_lo
  3117. h7_lo = a_lo % 2^32
  3118. h7_hi = (h7_hi + g_hi + (a_lo - h7_lo) / 2^32) % 2^32
  3119. a_lo = h8_lo + h_lo
  3120. h8_lo = a_lo % 2^32
  3121. h8_hi = (h8_hi + h_hi + (a_lo - h8_lo) / 2^32) % 2^32
  3122. end
  3123. H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3124. H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3125. end
  3126. if branch == "LIB32" then
  -- md5_feed_64(H, str, offs, size)  -- variant selected when branch == "LIB32"
  -- Updates the 4-word state H[1..4] in place by processing `size` bytes of str in 64-byte blocks.
  -- offs is 0-based: the first block covers bytes offs+1 .. offs+64 of str.
  -- Rotations are delegated to the external ROR helper; the rotation amount s starts at a fixed value for each
  -- group of 16 steps (25, 27, 28, 26) and is advanced after every step through the md5_next_shift lookup table.
  -- NOTE(review): `byte` is presumably string.byte, and ROR presumably reduces its first argument to 32 bits
  -- before rotating right - confirm against the bit library bindings.
  3127. function md5_feed_64(H, str, offs, size)
  3128. -- offs >= 0, size >= 0, size is multiple of 64
  3129. local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
  3130. local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  3131. for pos = offs, offs + size - 1, 64 do
  -- read 16 little-endian 32-bit words; pos is advanced locally while reading
  -- NOTE(review): relies on assignments to the loop variable not affecting the loop's own iteration - confirm for the targeted Lua versions
  3132. for j = 1, 16 do
  3133. pos = pos + 4
  3134. local a, b, c, d = byte(str, pos - 3, pos)
  3135. W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3136. end
  3137. local a, b, c, d = h1, h2, h3, h4
  -- steps 1..16: message words taken in order, mixing function (b AND c) + (NOT b AND d)
  3138. local s = 25
  3139. for j = 1, 16 do
  3140. local F = ROR(AND(b, c) + AND(-1-b, d) + a + K[j] + W[j], s) + b
  3141. s = md5_next_shift[s]
  3142. a = d
  3143. d = c
  3144. c = b
  3145. b = F
  3146. end
  -- steps 17..32: message word index (5*j-4) % 16 + 1, mixing function (d AND b) + (NOT d AND c)
  3147. s = 27
  3148. for j = 17, 32 do
  3149. local F = ROR(AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1], s) + b
  3150. s = md5_next_shift[s]
  3151. a = d
  3152. d = c
  3153. c = b
  3154. b = F
  3155. end
  -- steps 33..48: message word index (3*j+2) % 16 + 1, mixing function b XOR c XOR d
  3156. s = 28
  3157. for j = 33, 48 do
  3158. local F = ROR(XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1], s) + b
  3159. s = md5_next_shift[s]
  3160. a = d
  3161. d = c
  3162. c = b
  3163. b = F
  3164. end
  -- steps 49..64: message word index (j*7-7) % 16 + 1, mixing function c XOR (b OR NOT d)
  3165. s = 26
  3166. for j = 49, 64 do
  3167. local F = ROR(XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1], s) + b
  3168. s = md5_next_shift[s]
  3169. a = d
  3170. d = c
  3171. c = b
  3172. b = F
  3173. end
  -- add the step results to the previous state, modulo 2^32
  3174. h1 = (a + h1) % 2^32
  3175. h2 = (b + h2) % 2^32
  3176. h3 = (c + h3) % 2^32
  3177. h4 = (d + h4) % 2^32
  3178. end
  3179. H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  3180. end
  3181. elseif branch == "EMUL" then
  -- md5_feed_64(H, str, offs, size)  -- variant selected when branch == "EMUL"
  -- Updates the 4-word state H[1..4] in place by processing `size` bytes of str in 64-byte blocks.
  -- offs is 0-based: the first block covers bytes offs+1 .. offs+64 of str.
  -- The right rotation by s is done arithmetically: z = (sum % 2^32) / 2^s, y = fractional part of z,
  -- and "y * 2^32 + (z - y)" moves the s low bits (held in the fraction) to the top of the 32-bit value.
  -- The rotation amount s starts at a fixed value for each group of 16 steps (25, 27, 28, 26) and is advanced
  -- after every step through the md5_next_shift lookup table.
  -- NOTE(review): `byte` is presumably string.byte - confirm where it is localized.
  3182. function md5_feed_64(H, str, offs, size)
  3183. -- offs >= 0, size >= 0, size is multiple of 64
  3184. local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
  3185. local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  3186. for pos = offs, offs + size - 1, 64 do
  -- read 16 little-endian 32-bit words; pos is advanced locally while reading
  -- NOTE(review): relies on assignments to the loop variable not affecting the loop's own iteration - confirm for the targeted Lua versions
  3187. for j = 1, 16 do
  3188. pos = pos + 4
  3189. local a, b, c, d = byte(str, pos - 3, pos)
  3190. W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3191. end
  3192. local a, b, c, d = h1, h2, h3, h4
  -- steps 1..16: message words taken in order, mixing function (b AND c) + (NOT b AND d)
  3193. local s = 25
  3194. for j = 1, 16 do
  3195. local z = (AND(b, c) + AND(-1-b, d) + a + K[j] + W[j]) % 2^32 / 2^s
  3196. local y = z % 1
  3197. s = md5_next_shift[s]
  3198. a = d
  3199. d = c
  3200. c = b
  3201. b = y * 2^32 + (z - y) + b
  3202. end
  -- steps 17..32: message word index (5*j-4) % 16 + 1, mixing function (d AND b) + (NOT d AND c)
  3203. s = 27
  3204. for j = 17, 32 do
  3205. local z = (AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1]) % 2^32 / 2^s
  3206. local y = z % 1
  3207. s = md5_next_shift[s]
  3208. a = d
  3209. d = c
  3210. c = b
  3211. b = y * 2^32 + (z - y) + b
  3212. end
  -- steps 33..48: message word index (3*j+2) % 16 + 1, mixing function b XOR c XOR d
  3213. s = 28
  3214. for j = 33, 48 do
  3215. local z = (XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1]) % 2^32 / 2^s
  3216. local y = z % 1
  3217. s = md5_next_shift[s]
  3218. a = d
  3219. d = c
  3220. c = b
  3221. b = y * 2^32 + (z - y) + b
  3222. end
  -- steps 49..64: message word index (j*7-7) % 16 + 1, mixing function c XOR (b OR NOT d)
  3223. s = 26
  3224. for j = 49, 64 do
  3225. local z = (XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1]) % 2^32 / 2^s
  3226. local y = z % 1
  3227. s = md5_next_shift[s]
  3228. a = d
  3229. d = c
  3230. c = b
  3231. b = y * 2^32 + (z - y) + b
  3232. end
  -- add the step results to the previous state, modulo 2^32
  3233. h1 = (a + h1) % 2^32
  3234. h2 = (b + h2) % 2^32
  3235. h3 = (c + h3) % 2^32
  3236. h4 = (d + h4) % 2^32
  3237. end
  3238. H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  3239. end
  3240. end
  -- SHA-1 block function written with plain arithmetic plus the AND()/XOR() helpers.
  -- Folds size/64 consecutive 64-byte blocks of str (starting right after byte
  -- offset offs) into the five-word state H[1..5]; H is overwritten with the result.
  function sha1_feed_64(H, str, offs, size)
     -- offs >= 0, size >= 0, size is multiple of 64
     local sched = common_W
     local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
     for block_start = offs, offs + size - 1, 64 do
        -- the first sixteen schedule words are the block itself, read big-endian
        for idx = 1, 16 do
           local last = block_start + 4 * idx
           local b0, b1, b2, b3 = byte(str, last - 3, last)
           sched[idx] = ((b0 * 256 + b1) * 256 + b2) * 256 + b3
        end
        -- the remaining words: XOR of four earlier words, rotated left by one bit
        -- (doubling, then moving the overflow bit back to the bottom)
        for idx = 17, 80 do
           local doubled = XOR(sched[idx-3], sched[idx-8], sched[idx-14], sched[idx-16]) % 2^32 * 2
           local low = doubled % 2^32
           sched[idx] = low + (doubled - low) / 2^32
        end
        local a, b, c, d, e = h1, h2, h3, h4, h5
        -- In every step: `a` is rotated left by 5 (multiply, then wrap the overflow around),
        -- `b` is rotated right by 2 (divide, then move the fractional part to the top).
        for step = 1, 20 do
           local shifted = a * 2^5
           local wrapped = shifted % 2^32
           local sum = wrapped + (shifted - wrapped) / 2^32 + AND(b, c) + AND(-1-b, d) + 0x5A827999 + sched[step] + e        -- constant = floor(2^30 * sqrt(2))
           local quarter = b / 2^2
           e, d, c, b, a = d, c, quarter % 1 * (2^32 - 1) + quarter, a, sum % 2^32
        end
        for step = 21, 40 do
           local shifted = a * 2^5
           local wrapped = shifted % 2^32
           local sum = wrapped + (shifted - wrapped) / 2^32 + XOR(b, c, d) + 0x6ED9EBA1 + sched[step] + e                    -- 2^30 * sqrt(3)
           local quarter = b / 2^2
           e, d, c, b, a = d, c, quarter % 1 * (2^32 - 1) + quarter, a, sum % 2^32
        end
        for step = 41, 60 do
           local shifted = a * 2^5
           local wrapped = shifted % 2^32
           local sum = wrapped + (shifted - wrapped) / 2^32 + AND(d, c) + AND(b, XOR(d, c)) + 0x8F1BBCDC + sched[step] + e   -- 2^30 * sqrt(5)
           local quarter = b / 2^2
           e, d, c, b, a = d, c, quarter % 1 * (2^32 - 1) + quarter, a, sum % 2^32
        end
        for step = 61, 80 do
           local shifted = a * 2^5
           local wrapped = shifted % 2^32
           local sum = wrapped + (shifted - wrapped) / 2^32 + XOR(b, c, d) + 0xCA62C1D6 + sched[step] + e                    -- 2^30 * sqrt(10)
           local quarter = b / 2^2
           e, d, c, b, a = d, c, quarter % 1 * (2^32 - 1) + quarter, a, sum % 2^32
        end
        -- add the block result to the running state, modulo 2^32
        h1, h2, h3 = (a + h1) % 2^32, (b + h2) % 2^32, (c + h3) % 2^32
        h4, h5 = (d + h4) % 2^32, (e + h5) % 2^32
     end
     H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
  end
  3309. function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
        -- Absorbs `size` bytes of `str`, starting right after byte offset `offs`, into the 25-lane state
        -- kept in lanes_lo[1..25] / lanes_hi[1..25], and runs 24 rounds over the state after every block.
        -- Each lane is held as two numbers: lanes_lo[j] takes the first four bytes of an 8-byte group
        -- (read little-endian) and lanes_hi[j] the last four. Both tables are updated in place;
        -- the function returns nothing.
        -- NOTE(review): many rotated values below are not reduced modulo 2^32 before they are handed to
        -- XOR/AND; presumably those helpers ignore the excess high bits - confirm against their definitions.
  3310. -- This is an example of a Lua function having 79 local variables :-)
  3311. -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
  3312. local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
        -- number of 8-byte lanes covered by one input block
  3313. local qwords_qty = block_size_in_bytes / 8
  3314. for pos = offs, offs + size - 1, block_size_in_bytes do
        -- XOR the block into lanes 1..qwords_qty, 8 bytes per lane (low word first).
        -- pos is advanced by hand below; the enclosing for-loop is expected to keep its own counter.
  3315. for j = 1, qwords_qty do
  3316. local a, b, c, d = byte(str, pos + 1, pos + 4)
  3317. lanes_lo[j] = XOR(lanes_lo[j], ((d * 256 + c) * 256 + b) * 256 + a)
  3318. pos = pos + 8
  3319. a, b, c, d = byte(str, pos - 3, pos)
  3320. lanes_hi[j] = XOR(lanes_hi[j], ((d * 256 + c) * 256 + b) * 256 + a)
  3321. end
        -- Copy the whole state into locals; the rounds below work on these locals only.
  3322. local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi,
  3323. L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi,
  3324. L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi =
  3325. lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5],
  3326. lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10],
  3327. lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15],
  3328. lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20],
  3329. lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25]
  3330. for round_idx = 1, 24 do
        -- Ck = XOR of the five lanes k, k+5, k+10, k+15, k+20 (computed separately for the lo and hi halves).
  3331. local C1_lo = XOR(L01_lo, L06_lo, L11_lo, L16_lo, L21_lo)
  3332. local C1_hi = XOR(L01_hi, L06_hi, L11_hi, L16_hi, L21_hi)
  3333. local C2_lo = XOR(L02_lo, L07_lo, L12_lo, L17_lo, L22_lo)
  3334. local C2_hi = XOR(L02_hi, L07_hi, L12_hi, L17_hi, L22_hi)
  3335. local C3_lo = XOR(L03_lo, L08_lo, L13_lo, L18_lo, L23_lo)
  3336. local C3_hi = XOR(L03_hi, L08_hi, L13_hi, L18_hi, L23_hi)
  3337. local C4_lo = XOR(L04_lo, L09_lo, L14_lo, L19_lo, L24_lo)
  3338. local C4_hi = XOR(L04_hi, L09_hi, L14_hi, L19_hi, L24_hi)
  3339. local C5_lo = XOR(L05_lo, L10_lo, L15_lo, L20_lo, L25_lo)
  3340. local C5_hi = XOR(L05_hi, L10_hi, L15_hi, L20_hi, L25_hi)
        -- Group of lanes 2, 7, 12, 17, 22:
        -- D = C1 XOR (C3 rotated left by one bit across the lo/hi pair); T0..T4 are the lanes with D XORed in.
        -- Each T is then rotated as a 64-bit lo/hi pair by its own amount and stored into a lane of the same
        -- group, usually a different one than it came from.
  3341. local D_lo = XOR(C1_lo, C3_lo * 2 + (C3_hi % 2^32 - C3_hi % 2^31) / 2^31)
  3342. local D_hi = XOR(C1_hi, C3_hi * 2 + (C3_lo % 2^32 - C3_lo % 2^31) / 2^31)
  3343. local T0_lo = XOR(D_lo, L02_lo)
  3344. local T0_hi = XOR(D_hi, L02_hi)
  3345. local T1_lo = XOR(D_lo, L07_lo)
  3346. local T1_hi = XOR(D_hi, L07_hi)
  3347. local T2_lo = XOR(D_lo, L12_lo)
  3348. local T2_hi = XOR(D_hi, L12_hi)
  3349. local T3_lo = XOR(D_lo, L17_lo)
  3350. local T3_hi = XOR(D_hi, L17_hi)
  3351. local T4_lo = XOR(D_lo, L22_lo)
  3352. local T4_hi = XOR(D_hi, L22_hi)
  3353. L02_lo = (T1_lo % 2^32 - T1_lo % 2^20) / 2^20 + T1_hi * 2^12
  3354. L02_hi = (T1_hi % 2^32 - T1_hi % 2^20) / 2^20 + T1_lo * 2^12
  3355. L07_lo = (T3_lo % 2^32 - T3_lo % 2^19) / 2^19 + T3_hi * 2^13
  3356. L07_hi = (T3_hi % 2^32 - T3_hi % 2^19) / 2^19 + T3_lo * 2^13
  3357. L12_lo = T0_lo * 2 + (T0_hi % 2^32 - T0_hi % 2^31) / 2^31
  3358. L12_hi = T0_hi * 2 + (T0_lo % 2^32 - T0_lo % 2^31) / 2^31
  3359. L17_lo = T2_lo * 2^10 + (T2_hi % 2^32 - T2_hi % 2^22) / 2^22
  3360. L17_hi = T2_hi * 2^10 + (T2_lo % 2^32 - T2_lo % 2^22) / 2^22
  3361. L22_lo = T4_lo * 2^2 + (T4_hi % 2^32 - T4_hi % 2^30) / 2^30
  3362. L22_hi = T4_hi * 2^2 + (T4_lo % 2^32 - T4_lo % 2^30) / 2^30
        -- Group of lanes 3, 8, 13, 18, 23: D = C2 XOR rotl1(C4).
  3363. D_lo = XOR(C2_lo, C4_lo * 2 + (C4_hi % 2^32 - C4_hi % 2^31) / 2^31)
  3364. D_hi = XOR(C2_hi, C4_hi * 2 + (C4_lo % 2^32 - C4_lo % 2^31) / 2^31)
  3365. T0_lo = XOR(D_lo, L03_lo)
  3366. T0_hi = XOR(D_hi, L03_hi)
  3367. T1_lo = XOR(D_lo, L08_lo)
  3368. T1_hi = XOR(D_hi, L08_hi)
  3369. T2_lo = XOR(D_lo, L13_lo)
  3370. T2_hi = XOR(D_hi, L13_hi)
  3371. T3_lo = XOR(D_lo, L18_lo)
  3372. T3_hi = XOR(D_hi, L18_hi)
  3373. T4_lo = XOR(D_lo, L23_lo)
  3374. T4_hi = XOR(D_hi, L23_hi)
  3375. L03_lo = (T2_lo % 2^32 - T2_lo % 2^21) / 2^21 + T2_hi * 2^11
  3376. L03_hi = (T2_hi % 2^32 - T2_hi % 2^21) / 2^21 + T2_lo * 2^11
  3377. L08_lo = (T4_lo % 2^32 - T4_lo % 2^3) / 2^3 + T4_hi * 2^29 % 2^32
  3378. L08_hi = (T4_hi % 2^32 - T4_hi % 2^3) / 2^3 + T4_lo * 2^29 % 2^32
  3379. L13_lo = T1_lo * 2^6 + (T1_hi % 2^32 - T1_hi % 2^26) / 2^26
  3380. L13_hi = T1_hi * 2^6 + (T1_lo % 2^32 - T1_lo % 2^26) / 2^26
  3381. L18_lo = T3_lo * 2^15 + (T3_hi % 2^32 - T3_hi % 2^17) / 2^17
  3382. L18_hi = T3_hi * 2^15 + (T3_lo % 2^32 - T3_lo % 2^17) / 2^17
  3383. L23_lo = (T0_lo % 2^32 - T0_lo % 2^2) / 2^2 + T0_hi * 2^30 % 2^32
  3384. L23_hi = (T0_hi % 2^32 - T0_hi % 2^2) / 2^2 + T0_lo * 2^30 % 2^32
        -- Group of lanes 4, 9, 14, 19, 24: D = C3 XOR rotl1(C5).
  3385. D_lo = XOR(C3_lo, C5_lo * 2 + (C5_hi % 2^32 - C5_hi % 2^31) / 2^31)
  3386. D_hi = XOR(C3_hi, C5_hi * 2 + (C5_lo % 2^32 - C5_lo % 2^31) / 2^31)
  3387. T0_lo = XOR(D_lo, L04_lo)
  3388. T0_hi = XOR(D_hi, L04_hi)
  3389. T1_lo = XOR(D_lo, L09_lo)
  3390. T1_hi = XOR(D_hi, L09_hi)
  3391. T2_lo = XOR(D_lo, L14_lo)
  3392. T2_hi = XOR(D_hi, L14_hi)
  3393. T3_lo = XOR(D_lo, L19_lo)
  3394. T3_hi = XOR(D_hi, L19_hi)
  3395. T4_lo = XOR(D_lo, L24_lo)
  3396. T4_hi = XOR(D_hi, L24_hi)
  3397. L04_lo = T3_lo * 2^21 % 2^32 + (T3_hi % 2^32 - T3_hi % 2^11) / 2^11
  3398. L04_hi = T3_hi * 2^21 % 2^32 + (T3_lo % 2^32 - T3_lo % 2^11) / 2^11
  3399. L09_lo = T0_lo * 2^28 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^4) / 2^4
  3400. L09_hi = T0_hi * 2^28 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^4) / 2^4
  3401. L14_lo = T2_lo * 2^25 % 2^32 + (T2_hi % 2^32 - T2_hi % 2^7) / 2^7
  3402. L14_hi = T2_hi * 2^25 % 2^32 + (T2_lo % 2^32 - T2_lo % 2^7) / 2^7
  3403. L19_lo = (T4_lo % 2^32 - T4_lo % 2^8) / 2^8 + T4_hi * 2^24 % 2^32
  3404. L19_hi = (T4_hi % 2^32 - T4_hi % 2^8) / 2^8 + T4_lo * 2^24 % 2^32
  3405. L24_lo = (T1_lo % 2^32 - T1_lo % 2^9) / 2^9 + T1_hi * 2^23 % 2^32
  3406. L24_hi = (T1_hi % 2^32 - T1_hi % 2^9) / 2^9 + T1_lo * 2^23 % 2^32
        -- Group of lanes 5, 10, 15, 20, 25: D = C4 XOR rotl1(C1).
  3407. D_lo = XOR(C4_lo, C1_lo * 2 + (C1_hi % 2^32 - C1_hi % 2^31) / 2^31)
  3408. D_hi = XOR(C4_hi, C1_hi * 2 + (C1_lo % 2^32 - C1_lo % 2^31) / 2^31)
  3409. T0_lo = XOR(D_lo, L05_lo)
  3410. T0_hi = XOR(D_hi, L05_hi)
  3411. T1_lo = XOR(D_lo, L10_lo)
  3412. T1_hi = XOR(D_hi, L10_hi)
  3413. T2_lo = XOR(D_lo, L15_lo)
  3414. T2_hi = XOR(D_hi, L15_hi)
  3415. T3_lo = XOR(D_lo, L20_lo)
  3416. T3_hi = XOR(D_hi, L20_hi)
  3417. T4_lo = XOR(D_lo, L25_lo)
  3418. T4_hi = XOR(D_hi, L25_hi)
  3419. L05_lo = T4_lo * 2^14 + (T4_hi % 2^32 - T4_hi % 2^18) / 2^18
  3420. L05_hi = T4_hi * 2^14 + (T4_lo % 2^32 - T4_lo % 2^18) / 2^18
  3421. L10_lo = T1_lo * 2^20 % 2^32 + (T1_hi % 2^32 - T1_hi % 2^12) / 2^12
  3422. L10_hi = T1_hi * 2^20 % 2^32 + (T1_lo % 2^32 - T1_lo % 2^12) / 2^12
  3423. L15_lo = T3_lo * 2^8 + (T3_hi % 2^32 - T3_hi % 2^24) / 2^24
  3424. L15_hi = T3_hi * 2^8 + (T3_lo % 2^32 - T3_lo % 2^24) / 2^24
  3425. L20_lo = T0_lo * 2^27 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^5) / 2^5
  3426. L20_hi = T0_hi * 2^27 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^5) / 2^5
  3427. L25_lo = (T2_lo % 2^32 - T2_lo % 2^25) / 2^25 + T2_hi * 2^7
  3428. L25_hi = (T2_hi % 2^32 - T2_hi % 2^25) / 2^25 + T2_lo * 2^7
        -- Group of lanes 1, 6, 11, 16, 21: D = C5 XOR rotl1(C2). Lane 1 gets D XORed in but is not rotated.
  3429. D_lo = XOR(C5_lo, C2_lo * 2 + (C2_hi % 2^32 - C2_hi % 2^31) / 2^31)
  3430. D_hi = XOR(C5_hi, C2_hi * 2 + (C2_lo % 2^32 - C2_lo % 2^31) / 2^31)
  3431. T1_lo = XOR(D_lo, L06_lo)
  3432. T1_hi = XOR(D_hi, L06_hi)
  3433. T2_lo = XOR(D_lo, L11_lo)
  3434. T2_hi = XOR(D_hi, L11_hi)
  3435. T3_lo = XOR(D_lo, L16_lo)
  3436. T3_hi = XOR(D_hi, L16_hi)
  3437. T4_lo = XOR(D_lo, L21_lo)
  3438. T4_hi = XOR(D_hi, L21_hi)
  3439. L06_lo = T2_lo * 2^3 + (T2_hi % 2^32 - T2_hi % 2^29) / 2^29
  3440. L06_hi = T2_hi * 2^3 + (T2_lo % 2^32 - T2_lo % 2^29) / 2^29
  3441. L11_lo = T4_lo * 2^18 + (T4_hi % 2^32 - T4_hi % 2^14) / 2^14
  3442. L11_hi = T4_hi * 2^18 + (T4_lo % 2^32 - T4_lo % 2^14) / 2^14
  3443. L16_lo = (T1_lo % 2^32 - T1_lo % 2^28) / 2^28 + T1_hi * 2^4
  3444. L16_hi = (T1_hi % 2^32 - T1_hi % 2^28) / 2^28 + T1_lo * 2^4
  3445. L21_lo = (T3_lo % 2^32 - T3_lo % 2^23) / 2^23 + T3_hi * 2^9
  3446. L21_hi = (T3_hi % 2^32 - T3_hi % 2^23) / 2^23 + T3_lo * 2^9
  3447. L01_lo = XOR(D_lo, L01_lo)
  3448. L01_hi = XOR(D_hi, L01_hi)
        -- Non-linear step, one parallel assignment per row of five lanes (1-5, 6-10, 11-15, 16-20, 21-25):
        -- every new lane is x XOR AND(-1-y, z) for three lanes x, y, z of the same row. All right-hand sides
        -- are evaluated before any lane is overwritten. -1-y presumably acts as the bitwise complement of y
        -- under AND - confirm against AND's definition. Rows 2..5 read their inputs in a rotated order.
  3449. L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = XOR(L01_lo, AND(-1-L02_lo, L03_lo)), XOR(L02_lo, AND(-1-L03_lo, L04_lo)), XOR(L03_lo, AND(-1-L04_lo, L05_lo)), XOR(L04_lo, AND(-1-L05_lo, L01_lo)), XOR(L05_lo, AND(-1-L01_lo, L02_lo))
  3450. L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = XOR(L01_hi, AND(-1-L02_hi, L03_hi)), XOR(L02_hi, AND(-1-L03_hi, L04_hi)), XOR(L03_hi, AND(-1-L04_hi, L05_hi)), XOR(L04_hi, AND(-1-L05_hi, L01_hi)), XOR(L05_hi, AND(-1-L01_hi, L02_hi))
  3451. L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = XOR(L09_lo, AND(-1-L10_lo, L06_lo)), XOR(L10_lo, AND(-1-L06_lo, L07_lo)), XOR(L06_lo, AND(-1-L07_lo, L08_lo)), XOR(L07_lo, AND(-1-L08_lo, L09_lo)), XOR(L08_lo, AND(-1-L09_lo, L10_lo))
  3452. L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = XOR(L09_hi, AND(-1-L10_hi, L06_hi)), XOR(L10_hi, AND(-1-L06_hi, L07_hi)), XOR(L06_hi, AND(-1-L07_hi, L08_hi)), XOR(L07_hi, AND(-1-L08_hi, L09_hi)), XOR(L08_hi, AND(-1-L09_hi, L10_hi))
  3453. L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = XOR(L12_lo, AND(-1-L13_lo, L14_lo)), XOR(L13_lo, AND(-1-L14_lo, L15_lo)), XOR(L14_lo, AND(-1-L15_lo, L11_lo)), XOR(L15_lo, AND(-1-L11_lo, L12_lo)), XOR(L11_lo, AND(-1-L12_lo, L13_lo))
  3454. L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = XOR(L12_hi, AND(-1-L13_hi, L14_hi)), XOR(L13_hi, AND(-1-L14_hi, L15_hi)), XOR(L14_hi, AND(-1-L15_hi, L11_hi)), XOR(L15_hi, AND(-1-L11_hi, L12_hi)), XOR(L11_hi, AND(-1-L12_hi, L13_hi))
  3455. L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = XOR(L20_lo, AND(-1-L16_lo, L17_lo)), XOR(L16_lo, AND(-1-L17_lo, L18_lo)), XOR(L17_lo, AND(-1-L18_lo, L19_lo)), XOR(L18_lo, AND(-1-L19_lo, L20_lo)), XOR(L19_lo, AND(-1-L20_lo, L16_lo))
  3456. L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = XOR(L20_hi, AND(-1-L16_hi, L17_hi)), XOR(L16_hi, AND(-1-L17_hi, L18_hi)), XOR(L17_hi, AND(-1-L18_hi, L19_hi)), XOR(L18_hi, AND(-1-L19_hi, L20_hi)), XOR(L19_hi, AND(-1-L20_hi, L16_hi))
  3457. L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = XOR(L23_lo, AND(-1-L24_lo, L25_lo)), XOR(L24_lo, AND(-1-L25_lo, L21_lo)), XOR(L25_lo, AND(-1-L21_lo, L22_lo)), XOR(L21_lo, AND(-1-L22_lo, L23_lo)), XOR(L22_lo, AND(-1-L23_lo, L24_lo))
  3458. L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = XOR(L23_hi, AND(-1-L24_hi, L25_hi)), XOR(L24_hi, AND(-1-L25_hi, L21_hi)), XOR(L25_hi, AND(-1-L21_hi, L22_hi)), XOR(L21_hi, AND(-1-L22_hi, L23_hi)), XOR(L22_hi, AND(-1-L23_hi, L24_hi))
        -- Mix the per-round constant into lane 1.
  3459. L01_lo = XOR(L01_lo, RC_lo[round_idx])
  3460. L01_hi = L01_hi + RC_hi[round_idx] -- RC_hi[] is either 0 or 0x80000000, so we could use fast addition instead of slow XOR
  3461. end
        -- Write the locals back into the state tables.
  3462. lanes_lo[1] = L01_lo; lanes_hi[1] = L01_hi
  3463. lanes_lo[2] = L02_lo; lanes_hi[2] = L02_hi
  3464. lanes_lo[3] = L03_lo; lanes_hi[3] = L03_hi
  3465. lanes_lo[4] = L04_lo; lanes_hi[4] = L04_hi
  3466. lanes_lo[5] = L05_lo; lanes_hi[5] = L05_hi
  3467. lanes_lo[6] = L06_lo; lanes_hi[6] = L06_hi
  3468. lanes_lo[7] = L07_lo; lanes_hi[7] = L07_hi
  3469. lanes_lo[8] = L08_lo; lanes_hi[8] = L08_hi
  3470. lanes_lo[9] = L09_lo; lanes_hi[9] = L09_hi
  3471. lanes_lo[10] = L10_lo; lanes_hi[10] = L10_hi
  3472. lanes_lo[11] = L11_lo; lanes_hi[11] = L11_hi
  3473. lanes_lo[12] = L12_lo; lanes_hi[12] = L12_hi
  3474. lanes_lo[13] = L13_lo; lanes_hi[13] = L13_hi
  3475. lanes_lo[14] = L14_lo; lanes_hi[14] = L14_hi
  3476. lanes_lo[15] = L15_lo; lanes_hi[15] = L15_hi
  3477. lanes_lo[16] = L16_lo; lanes_hi[16] = L16_hi
  3478. lanes_lo[17] = L17_lo; lanes_hi[17] = L17_hi
  3479. lanes_lo[18] = L18_lo; lanes_hi[18] = L18_hi
  3480. lanes_lo[19] = L19_lo; lanes_hi[19] = L19_hi
  3481. lanes_lo[20] = L20_lo; lanes_hi[20] = L20_hi
  3482. lanes_lo[21] = L21_lo; lanes_hi[21] = L21_hi
  3483. lanes_lo[22] = L22_lo; lanes_hi[22] = L22_hi
  3484. lanes_lo[23] = L23_lo; lanes_hi[23] = L23_hi
  3485. lanes_lo[24] = L24_lo; lanes_hi[24] = L24_hi
  3486. lanes_lo[25] = L25_lo; lanes_hi[25] = L25_hi
  3487. end
  3488. end
  3489. function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        -- Compresses size/64 consecutive 64-byte blocks into the eight-word state H[1..8], which is
        -- updated in place. Returns the updated byte counter.
        --   str              : input string; when nil/false no bytes are read and whatever is already
        --                      stored in common_W is used as the message words
        --   offs             : byte offset in str after which reading starts
        --   bytes_compressed : byte counter before this call; every block adds last_block_size to it,
        --                      or 64 when last_block_size is nil/false
        --   last_block_size  : when set, vE is additionally complemented ("flag f0" below)
        --   is_last_node     : when set, vF is complemented ("flag f1" below)
  3490. -- offs >= 0, size >= 0, size is multiple of 64
  3491. local W = common_W
  3492. local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  3493. for pos = offs, offs + size - 1, 64 do
  3494. if str then
        -- read the block as sixteen little-endian 32-bit words
  3495. for j = 1, 16 do
  3496. pos = pos + 4
  3497. local a, b, c, d = byte(str, pos - 3, pos)
  3498. W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3499. end
  3500. end
        -- working vector: v0..v7 start from the current state, v8..vF from sha2_H_hi[1..8]
  3501. local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  3502. local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
        -- advance the byte counter and XOR its two 32-bit halves into vC / vD
  3503. bytes_compressed = bytes_compressed + (last_block_size or 64)
  3504. local t0 = bytes_compressed % 2^32
  3505. local t1 = (bytes_compressed - t0) / 2^32
  3506. vC = XOR(vC, t0) -- t0 = low_4_bytes(bytes_compressed)
  3507. vD = XOR(vD, t1) -- t1 = high_4_bytes(bytes_compressed)
  3508. if last_block_size then -- flag f0
  3509. vE = -1 - vE
  3510. end
  3511. if is_last_node then -- flag f1
  3512. vF = -1 - vF
  3513. end
        -- 10 rounds; sigma[j] supplies the 16 message-word indices used in round j.
        -- Rotation idiom used throughout: `x = XOR(..) % 2^32 / 2^k` turns the low k bits into a
        -- fractional part, and `x = x % 1 * (2^32 - 1) + x` moves that fraction to the top,
        -- i.e. a 32-bit right rotation by k bits (k = 16, 12, 8, 7).
        -- Sums such as v0 + v4 + W[..] are left unreduced; they are only reduced by the `% 2^32` after XOR.
  3514. for j = 1, 10 do
  3515. local row = sigma[j]
        -- column step on (v0, v4, v8, vC)
  3516. v0 = v0 + v4 + W[row[1]]
  3517. vC = XOR(vC, v0) % 2^32 / 2^16
  3518. vC = vC % 1 * (2^32 - 1) + vC
  3519. v8 = v8 + vC
  3520. v4 = XOR(v4, v8) % 2^32 / 2^12
  3521. v4 = v4 % 1 * (2^32 - 1) + v4
  3522. v0 = v0 + v4 + W[row[2]]
  3523. vC = XOR(vC, v0) % 2^32 / 2^8
  3524. vC = vC % 1 * (2^32 - 1) + vC
  3525. v8 = v8 + vC
  3526. v4 = XOR(v4, v8) % 2^32 / 2^7
  3527. v4 = v4 % 1 * (2^32 - 1) + v4
        -- column step on (v1, v5, v9, vD)
  3528. v1 = v1 + v5 + W[row[3]]
  3529. vD = XOR(vD, v1) % 2^32 / 2^16
  3530. vD = vD % 1 * (2^32 - 1) + vD
  3531. v9 = v9 + vD
  3532. v5 = XOR(v5, v9) % 2^32 / 2^12
  3533. v5 = v5 % 1 * (2^32 - 1) + v5
  3534. v1 = v1 + v5 + W[row[4]]
  3535. vD = XOR(vD, v1) % 2^32 / 2^8
  3536. vD = vD % 1 * (2^32 - 1) + vD
  3537. v9 = v9 + vD
  3538. v5 = XOR(v5, v9) % 2^32 / 2^7
  3539. v5 = v5 % 1 * (2^32 - 1) + v5
        -- column step on (v2, v6, vA, vE)
  3540. v2 = v2 + v6 + W[row[5]]
  3541. vE = XOR(vE, v2) % 2^32 / 2^16
  3542. vE = vE % 1 * (2^32 - 1) + vE
  3543. vA = vA + vE
  3544. v6 = XOR(v6, vA) % 2^32 / 2^12
  3545. v6 = v6 % 1 * (2^32 - 1) + v6
  3546. v2 = v2 + v6 + W[row[6]]
  3547. vE = XOR(vE, v2) % 2^32 / 2^8
  3548. vE = vE % 1 * (2^32 - 1) + vE
  3549. vA = vA + vE
  3550. v6 = XOR(v6, vA) % 2^32 / 2^7
  3551. v6 = v6 % 1 * (2^32 - 1) + v6
        -- column step on (v3, v7, vB, vF)
  3552. v3 = v3 + v7 + W[row[7]]
  3553. vF = XOR(vF, v3) % 2^32 / 2^16
  3554. vF = vF % 1 * (2^32 - 1) + vF
  3555. vB = vB + vF
  3556. v7 = XOR(v7, vB) % 2^32 / 2^12
  3557. v7 = v7 % 1 * (2^32 - 1) + v7
  3558. v3 = v3 + v7 + W[row[8]]
  3559. vF = XOR(vF, v3) % 2^32 / 2^8
  3560. vF = vF % 1 * (2^32 - 1) + vF
  3561. vB = vB + vF
  3562. v7 = XOR(v7, vB) % 2^32 / 2^7
  3563. v7 = v7 % 1 * (2^32 - 1) + v7
        -- diagonal step on (v0, v5, vA, vF)
  3564. v0 = v0 + v5 + W[row[9]]
  3565. vF = XOR(vF, v0) % 2^32 / 2^16
  3566. vF = vF % 1 * (2^32 - 1) + vF
  3567. vA = vA + vF
  3568. v5 = XOR(v5, vA) % 2^32 / 2^12
  3569. v5 = v5 % 1 * (2^32 - 1) + v5
  3570. v0 = v0 + v5 + W[row[10]]
  3571. vF = XOR(vF, v0) % 2^32 / 2^8
  3572. vF = vF % 1 * (2^32 - 1) + vF
  3573. vA = vA + vF
  3574. v5 = XOR(v5, vA) % 2^32 / 2^7
  3575. v5 = v5 % 1 * (2^32 - 1) + v5
        -- diagonal step on (v1, v6, vB, vC)
  3576. v1 = v1 + v6 + W[row[11]]
  3577. vC = XOR(vC, v1) % 2^32 / 2^16
  3578. vC = vC % 1 * (2^32 - 1) + vC
  3579. vB = vB + vC
  3580. v6 = XOR(v6, vB) % 2^32 / 2^12
  3581. v6 = v6 % 1 * (2^32 - 1) + v6
  3582. v1 = v1 + v6 + W[row[12]]
  3583. vC = XOR(vC, v1) % 2^32 / 2^8
  3584. vC = vC % 1 * (2^32 - 1) + vC
  3585. vB = vB + vC
  3586. v6 = XOR(v6, vB) % 2^32 / 2^7
  3587. v6 = v6 % 1 * (2^32 - 1) + v6
        -- diagonal step on (v2, v7, v8, vD)
  3588. v2 = v2 + v7 + W[row[13]]
  3589. vD = XOR(vD, v2) % 2^32 / 2^16
  3590. vD = vD % 1 * (2^32 - 1) + vD
  3591. v8 = v8 + vD
  3592. v7 = XOR(v7, v8) % 2^32 / 2^12
  3593. v7 = v7 % 1 * (2^32 - 1) + v7
  3594. v2 = v2 + v7 + W[row[14]]
  3595. vD = XOR(vD, v2) % 2^32 / 2^8
  3596. vD = vD % 1 * (2^32 - 1) + vD
  3597. v8 = v8 + vD
  3598. v7 = XOR(v7, v8) % 2^32 / 2^7
  3599. v7 = v7 % 1 * (2^32 - 1) + v7
        -- diagonal step on (v3, v4, v9, vE)
  3600. v3 = v3 + v4 + W[row[15]]
  3601. vE = XOR(vE, v3) % 2^32 / 2^16
  3602. vE = vE % 1 * (2^32 - 1) + vE
  3603. v9 = v9 + vE
  3604. v4 = XOR(v4, v9) % 2^32 / 2^12
  3605. v4 = v4 % 1 * (2^32 - 1) + v4
  3606. v3 = v3 + v4 + W[row[16]]
  3607. vE = XOR(vE, v3) % 2^32 / 2^8
  3608. vE = vE % 1 * (2^32 - 1) + vE
  3609. v9 = v9 + vE
  3610. v4 = XOR(v4, v9) % 2^32 / 2^7
  3611. v4 = v4 % 1 * (2^32 - 1) + v4
  3612. end
        -- fold both halves of the working vector back into the state
  3613. h1 = XOR(h1, v0, v8)
  3614. h2 = XOR(h2, v1, v9)
  3615. h3 = XOR(h3, v2, vA)
  3616. h4 = XOR(h4, v3, vB)
  3617. h5 = XOR(h5, v4, vC)
  3618. h6 = XOR(h6, v5, vD)
  3619. h7 = XOR(h7, v6, vE)
  3620. h8 = XOR(h8, v7, vF)
  3621. end
  3622. H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  3623. return bytes_compressed
  3624. end
  3625. function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        -- Compresses size/128 consecutive 128-byte blocks into the eight-word state, whose 64-bit words
        -- are kept as two 32-bit halves in H_lo[1..8] and H_hi[1..8]. Both tables are updated in place.
        -- Returns the updated byte counter.
        --   str              : input string; when nil/false no bytes are read and whatever is already
        --                      stored in common_W is used as the message words
        --   offs             : byte offset in str after which reading starts
        --   bytes_compressed : byte counter before this call; every block adds last_block_size to it,
        --                      or 128 when last_block_size is nil/false
        --   last_block_size  : when set, both halves of vE are additionally complemented ("flag f0" below)
        --   is_last_node     : when set, both halves of vF are complemented ("flag f1" below)
  3626. -- offs >= 0, size >= 0, size is multiple of 128
  3627. local W = common_W
  3628. local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  3629. local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  3630. for pos = offs, offs + size - 1, 128 do
  3631. if str then
        -- read the block as 32 little-endian 32-bit words; message word m is the pair W[2m-1] (low), W[2m] (high)
  3632. for j = 1, 32 do
  3633. pos = pos + 4
  3634. local a, b, c, d = byte(str, pos - 3, pos)
  3635. W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3636. end
  3637. end
        -- working vector: v0..v7 start from the current state, v8..vF from sha2_H_lo / sha2_H_hi
  3638. local v0_lo, v1_lo, v2_lo, v3_lo, v4_lo, v5_lo, v6_lo, v7_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3639. local v0_hi, v1_hi, v2_hi, v3_hi, v4_hi, v5_hi, v6_hi, v7_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3640. local v8_lo, v9_lo, vA_lo, vB_lo, vC_lo, vD_lo, vE_lo, vF_lo = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  3641. local v8_hi, v9_hi, vA_hi, vB_hi, vC_hi, vD_hi, vE_hi, vF_hi = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
        -- advance the byte counter and XOR its two 32-bit halves into vC
  3642. bytes_compressed = bytes_compressed + (last_block_size or 128)
  3643. local t0_lo = bytes_compressed % 2^32
  3644. local t0_hi = (bytes_compressed - t0_lo) / 2^32
  3645. vC_lo = XOR(vC_lo, t0_lo) -- t0 = low_8_bytes(bytes_compressed)
  3646. vC_hi = XOR(vC_hi, t0_hi)
  3647. -- t1 = high_8_bytes(bytes_compressed) = 0, message length is always below 2^53 bytes
  3648. if last_block_size then -- flag f0
  3649. vE_lo = -1 - vE_lo
  3650. vE_hi = -1 - vE_hi
  3651. end
  3652. if is_last_node then -- flag f1
  3653. vF_lo = -1 - vF_lo
  3654. vF_hi = -1 - vF_hi
  3655. end
        -- 12 rounds; sigma[j] supplies the 16 message-word indices used in round j.
        -- Idioms used in every step below:
        --   * 64-bit addition: the low halves are summed into z, `z % 2^32` is the new low half and the
        --     carry `(z - lo) / 2^32` is added to the high half (which is left unreduced);
        --   * XOR with swapped targets (`x_lo, x_hi = XOR(x_hi, ..), XOR(x_lo, ..)`) exchanges the halves,
        --     i.e. a rotation by 32 bits;
        --   * the `% 2^24`, `% 2^16`, `% 2^31` splits rotate the lo/hi pair right by 24 bits, right by
        --     16 bits and left by 1 bit respectively.
  3656. for j = 1, 12 do
  3657. local row = sigma[j]
        -- column step on (v0, v4, v8, vC)
  3658. local k = row[1] * 2
  3659. local z = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1]
  3660. v0_lo = z % 2^32
  3661. v0_hi = v0_hi + v4_hi + (z - v0_lo) / 2^32 + W[k]
  3662. vC_lo, vC_hi = XOR(vC_hi, v0_hi), XOR(vC_lo, v0_lo)
  3663. z = v8_lo % 2^32 + vC_lo % 2^32
  3664. v8_lo = z % 2^32
  3665. v8_hi = v8_hi + vC_hi + (z - v8_lo) / 2^32
  3666. v4_lo, v4_hi = XOR(v4_lo, v8_lo), XOR(v4_hi, v8_hi)
  3667. local z_lo, z_hi = v4_lo % 2^24, v4_hi % 2^24
  3668. v4_lo, v4_hi = (v4_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v4_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3669. k = row[2] * 2
  3670. z = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1]
  3671. v0_lo = z % 2^32
  3672. v0_hi = v0_hi + v4_hi + (z - v0_lo) / 2^32 + W[k]
  3673. vC_lo, vC_hi = XOR(vC_lo, v0_lo), XOR(vC_hi, v0_hi)
  3674. z_lo, z_hi = vC_lo % 2^16, vC_hi % 2^16
  3675. vC_lo, vC_hi = (vC_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vC_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3676. z = v8_lo % 2^32 + vC_lo % 2^32
  3677. v8_lo = z % 2^32
  3678. v8_hi = v8_hi + vC_hi + (z - v8_lo) / 2^32
  3679. v4_lo, v4_hi = XOR(v4_lo, v8_lo), XOR(v4_hi, v8_hi)
  3680. z_lo, z_hi = v4_lo % 2^31, v4_hi % 2^31
  3681. v4_lo, v4_hi = z_lo * 2^1 + (v4_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v4_lo - z_lo) / 2^31 % 2^1
        -- column step on (v1, v5, v9, vD)
  3682. k = row[3] * 2
  3683. z = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1]
  3684. v1_lo = z % 2^32
  3685. v1_hi = v1_hi + v5_hi + (z - v1_lo) / 2^32 + W[k]
  3686. vD_lo, vD_hi = XOR(vD_hi, v1_hi), XOR(vD_lo, v1_lo)
  3687. z = v9_lo % 2^32 + vD_lo % 2^32
  3688. v9_lo = z % 2^32
  3689. v9_hi = v9_hi + vD_hi + (z - v9_lo) / 2^32
  3690. v5_lo, v5_hi = XOR(v5_lo, v9_lo), XOR(v5_hi, v9_hi)
  3691. z_lo, z_hi = v5_lo % 2^24, v5_hi % 2^24
  3692. v5_lo, v5_hi = (v5_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v5_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3693. k = row[4] * 2
  3694. z = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1]
  3695. v1_lo = z % 2^32
  3696. v1_hi = v1_hi + v5_hi + (z - v1_lo) / 2^32 + W[k]
  3697. vD_lo, vD_hi = XOR(vD_lo, v1_lo), XOR(vD_hi, v1_hi)
  3698. z_lo, z_hi = vD_lo % 2^16, vD_hi % 2^16
  3699. vD_lo, vD_hi = (vD_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vD_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3700. z = v9_lo % 2^32 + vD_lo % 2^32
  3701. v9_lo = z % 2^32
  3702. v9_hi = v9_hi + vD_hi + (z - v9_lo) / 2^32
  3703. v5_lo, v5_hi = XOR(v5_lo, v9_lo), XOR(v5_hi, v9_hi)
  3704. z_lo, z_hi = v5_lo % 2^31, v5_hi % 2^31
  3705. v5_lo, v5_hi = z_lo * 2^1 + (v5_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v5_lo - z_lo) / 2^31 % 2^1
        -- column step on (v2, v6, vA, vE)
  3706. k = row[5] * 2
  3707. z = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1]
  3708. v2_lo = z % 2^32
  3709. v2_hi = v2_hi + v6_hi + (z - v2_lo) / 2^32 + W[k]
  3710. vE_lo, vE_hi = XOR(vE_hi, v2_hi), XOR(vE_lo, v2_lo)
  3711. z = vA_lo % 2^32 + vE_lo % 2^32
  3712. vA_lo = z % 2^32
  3713. vA_hi = vA_hi + vE_hi + (z - vA_lo) / 2^32
  3714. v6_lo, v6_hi = XOR(v6_lo, vA_lo), XOR(v6_hi, vA_hi)
  3715. z_lo, z_hi = v6_lo % 2^24, v6_hi % 2^24
  3716. v6_lo, v6_hi = (v6_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v6_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3717. k = row[6] * 2
  3718. z = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1]
  3719. v2_lo = z % 2^32
  3720. v2_hi = v2_hi + v6_hi + (z - v2_lo) / 2^32 + W[k]
  3721. vE_lo, vE_hi = XOR(vE_lo, v2_lo), XOR(vE_hi, v2_hi)
  3722. z_lo, z_hi = vE_lo % 2^16, vE_hi % 2^16
  3723. vE_lo, vE_hi = (vE_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vE_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3724. z = vA_lo % 2^32 + vE_lo % 2^32
  3725. vA_lo = z % 2^32
  3726. vA_hi = vA_hi + vE_hi + (z - vA_lo) / 2^32
  3727. v6_lo, v6_hi = XOR(v6_lo, vA_lo), XOR(v6_hi, vA_hi)
  3728. z_lo, z_hi = v6_lo % 2^31, v6_hi % 2^31
  3729. v6_lo, v6_hi = z_lo * 2^1 + (v6_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v6_lo - z_lo) / 2^31 % 2^1
        -- column step on (v3, v7, vB, vF)
  3730. k = row[7] * 2
  3731. z = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1]
  3732. v3_lo = z % 2^32
  3733. v3_hi = v3_hi + v7_hi + (z - v3_lo) / 2^32 + W[k]
  3734. vF_lo, vF_hi = XOR(vF_hi, v3_hi), XOR(vF_lo, v3_lo)
  3735. z = vB_lo % 2^32 + vF_lo % 2^32
  3736. vB_lo = z % 2^32
  3737. vB_hi = vB_hi + vF_hi + (z - vB_lo) / 2^32
  3738. v7_lo, v7_hi = XOR(v7_lo, vB_lo), XOR(v7_hi, vB_hi)
  3739. z_lo, z_hi = v7_lo % 2^24, v7_hi % 2^24
  3740. v7_lo, v7_hi = (v7_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v7_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3741. k = row[8] * 2
  3742. z = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1]
  3743. v3_lo = z % 2^32
  3744. v3_hi = v3_hi + v7_hi + (z - v3_lo) / 2^32 + W[k]
  3745. vF_lo, vF_hi = XOR(vF_lo, v3_lo), XOR(vF_hi, v3_hi)
  3746. z_lo, z_hi = vF_lo % 2^16, vF_hi % 2^16
  3747. vF_lo, vF_hi = (vF_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vF_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3748. z = vB_lo % 2^32 + vF_lo % 2^32
  3749. vB_lo = z % 2^32
  3750. vB_hi = vB_hi + vF_hi + (z - vB_lo) / 2^32
  3751. v7_lo, v7_hi = XOR(v7_lo, vB_lo), XOR(v7_hi, vB_hi)
  3752. z_lo, z_hi = v7_lo % 2^31, v7_hi % 2^31
  3753. v7_lo, v7_hi = z_lo * 2^1 + (v7_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v7_lo - z_lo) / 2^31 % 2^1
        -- diagonal step on (v0, v5, vA, vF)
  3754. k = row[9] * 2
  3755. z = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1]
  3756. v0_lo = z % 2^32
  3757. v0_hi = v0_hi + v5_hi + (z - v0_lo) / 2^32 + W[k]
  3758. vF_lo, vF_hi = XOR(vF_hi, v0_hi), XOR(vF_lo, v0_lo)
  3759. z = vA_lo % 2^32 + vF_lo % 2^32
  3760. vA_lo = z % 2^32
  3761. vA_hi = vA_hi + vF_hi + (z - vA_lo) / 2^32
  3762. v5_lo, v5_hi = XOR(v5_lo, vA_lo), XOR(v5_hi, vA_hi)
  3763. z_lo, z_hi = v5_lo % 2^24, v5_hi % 2^24
  3764. v5_lo, v5_hi = (v5_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v5_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3765. k = row[10] * 2
  3766. z = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1]
  3767. v0_lo = z % 2^32
  3768. v0_hi = v0_hi + v5_hi + (z - v0_lo) / 2^32 + W[k]
  3769. vF_lo, vF_hi = XOR(vF_lo, v0_lo), XOR(vF_hi, v0_hi)
  3770. z_lo, z_hi = vF_lo % 2^16, vF_hi % 2^16
  3771. vF_lo, vF_hi = (vF_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vF_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3772. z = vA_lo % 2^32 + vF_lo % 2^32
  3773. vA_lo = z % 2^32
  3774. vA_hi = vA_hi + vF_hi + (z - vA_lo) / 2^32
  3775. v5_lo, v5_hi = XOR(v5_lo, vA_lo), XOR(v5_hi, vA_hi)
  3776. z_lo, z_hi = v5_lo % 2^31, v5_hi % 2^31
  3777. v5_lo, v5_hi = z_lo * 2^1 + (v5_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v5_lo - z_lo) / 2^31 % 2^1
        -- diagonal step on (v1, v6, vB, vC)
  3778. k = row[11] * 2
  3779. z = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1]
  3780. v1_lo = z % 2^32
  3781. v1_hi = v1_hi + v6_hi + (z - v1_lo) / 2^32 + W[k]
  3782. vC_lo, vC_hi = XOR(vC_hi, v1_hi), XOR(vC_lo, v1_lo)
  3783. z = vB_lo % 2^32 + vC_lo % 2^32
  3784. vB_lo = z % 2^32
  3785. vB_hi = vB_hi + vC_hi + (z - vB_lo) / 2^32
  3786. v6_lo, v6_hi = XOR(v6_lo, vB_lo), XOR(v6_hi, vB_hi)
  3787. z_lo, z_hi = v6_lo % 2^24, v6_hi % 2^24
  3788. v6_lo, v6_hi = (v6_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v6_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3789. k = row[12] * 2
  3790. z = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1]
  3791. v1_lo = z % 2^32
  3792. v1_hi = v1_hi + v6_hi + (z - v1_lo) / 2^32 + W[k]
  3793. vC_lo, vC_hi = XOR(vC_lo, v1_lo), XOR(vC_hi, v1_hi)
  3794. z_lo, z_hi = vC_lo % 2^16, vC_hi % 2^16
  3795. vC_lo, vC_hi = (vC_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vC_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3796. z = vB_lo % 2^32 + vC_lo % 2^32
  3797. vB_lo = z % 2^32
  3798. vB_hi = vB_hi + vC_hi + (z - vB_lo) / 2^32
  3799. v6_lo, v6_hi = XOR(v6_lo, vB_lo), XOR(v6_hi, vB_hi)
  3800. z_lo, z_hi = v6_lo % 2^31, v6_hi % 2^31
  3801. v6_lo, v6_hi = z_lo * 2^1 + (v6_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v6_lo - z_lo) / 2^31 % 2^1
        -- diagonal step on (v2, v7, v8, vD)
  3802. k = row[13] * 2
  3803. z = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1]
  3804. v2_lo = z % 2^32
  3805. v2_hi = v2_hi + v7_hi + (z - v2_lo) / 2^32 + W[k]
  3806. vD_lo, vD_hi = XOR(vD_hi, v2_hi), XOR(vD_lo, v2_lo)
  3807. z = v8_lo % 2^32 + vD_lo % 2^32
  3808. v8_lo = z % 2^32
  3809. v8_hi = v8_hi + vD_hi + (z - v8_lo) / 2^32
  3810. v7_lo, v7_hi = XOR(v7_lo, v8_lo), XOR(v7_hi, v8_hi)
  3811. z_lo, z_hi = v7_lo % 2^24, v7_hi % 2^24
  3812. v7_lo, v7_hi = (v7_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v7_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3813. k = row[14] * 2
  3814. z = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1]
  3815. v2_lo = z % 2^32
  3816. v2_hi = v2_hi + v7_hi + (z - v2_lo) / 2^32 + W[k]
  3817. vD_lo, vD_hi = XOR(vD_lo, v2_lo), XOR(vD_hi, v2_hi)
  3818. z_lo, z_hi = vD_lo % 2^16, vD_hi % 2^16
  3819. vD_lo, vD_hi = (vD_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vD_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3820. z = v8_lo % 2^32 + vD_lo % 2^32
  3821. v8_lo = z % 2^32
  3822. v8_hi = v8_hi + vD_hi + (z - v8_lo) / 2^32
  3823. v7_lo, v7_hi = XOR(v7_lo, v8_lo), XOR(v7_hi, v8_hi)
  3824. z_lo, z_hi = v7_lo % 2^31, v7_hi % 2^31
  3825. v7_lo, v7_hi = z_lo * 2^1 + (v7_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v7_lo - z_lo) / 2^31 % 2^1
        -- diagonal step on (v3, v4, v9, vE)
  3826. k = row[15] * 2
  3827. z = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1]
  3828. v3_lo = z % 2^32
  3829. v3_hi = v3_hi + v4_hi + (z - v3_lo) / 2^32 + W[k]
  3830. vE_lo, vE_hi = XOR(vE_hi, v3_hi), XOR(vE_lo, v3_lo)
  3831. z = v9_lo % 2^32 + vE_lo % 2^32
  3832. v9_lo = z % 2^32
  3833. v9_hi = v9_hi + vE_hi + (z - v9_lo) / 2^32
  3834. v4_lo, v4_hi = XOR(v4_lo, v9_lo), XOR(v4_hi, v9_hi)
  3835. z_lo, z_hi = v4_lo % 2^24, v4_hi % 2^24
  3836. v4_lo, v4_hi = (v4_lo - z_lo) / 2^24 % 2^8 + z_hi * 2^8, (v4_hi - z_hi) / 2^24 % 2^8 + z_lo * 2^8
  3837. k = row[16] * 2
  3838. z = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1]
  3839. v3_lo = z % 2^32
  3840. v3_hi = v3_hi + v4_hi + (z - v3_lo) / 2^32 + W[k]
  3841. vE_lo, vE_hi = XOR(vE_lo, v3_lo), XOR(vE_hi, v3_hi)
  3842. z_lo, z_hi = vE_lo % 2^16, vE_hi % 2^16
  3843. vE_lo, vE_hi = (vE_lo - z_lo) / 2^16 % 2^16 + z_hi * 2^16, (vE_hi - z_hi) / 2^16 % 2^16 + z_lo * 2^16
  3844. z = v9_lo % 2^32 + vE_lo % 2^32
  3845. v9_lo = z % 2^32
  3846. v9_hi = v9_hi + vE_hi + (z - v9_lo) / 2^32
  3847. v4_lo, v4_hi = XOR(v4_lo, v9_lo), XOR(v4_hi, v9_hi)
  3848. z_lo, z_hi = v4_lo % 2^31, v4_hi % 2^31
  3849. v4_lo, v4_hi = z_lo * 2^1 + (v4_hi - z_hi) / 2^31 % 2^1, z_hi * 2^1 + (v4_lo - z_lo) / 2^31 % 2^1
  3850. end
        -- fold both halves of the working vector back into the state, reducing each half modulo 2^32
  3851. h1_lo = XOR(h1_lo, v0_lo, v8_lo) % 2^32
  3852. h2_lo = XOR(h2_lo, v1_lo, v9_lo) % 2^32
  3853. h3_lo = XOR(h3_lo, v2_lo, vA_lo) % 2^32
  3854. h4_lo = XOR(h4_lo, v3_lo, vB_lo) % 2^32
  3855. h5_lo = XOR(h5_lo, v4_lo, vC_lo) % 2^32
  3856. h6_lo = XOR(h6_lo, v5_lo, vD_lo) % 2^32
  3857. h7_lo = XOR(h7_lo, v6_lo, vE_lo) % 2^32
  3858. h8_lo = XOR(h8_lo, v7_lo, vF_lo) % 2^32
  3859. h1_hi = XOR(h1_hi, v0_hi, v8_hi) % 2^32
  3860. h2_hi = XOR(h2_hi, v1_hi, v9_hi) % 2^32
  3861. h3_hi = XOR(h3_hi, v2_hi, vA_hi) % 2^32
  3862. h4_hi = XOR(h4_hi, v3_hi, vB_hi) % 2^32
  3863. h5_hi = XOR(h5_hi, v4_hi, vC_hi) % 2^32
  3864. h6_hi = XOR(h6_hi, v5_hi, vD_hi) % 2^32
  3865. h7_hi = XOR(h7_hi, v6_hi, vE_hi) % 2^32
  3866. h8_hi = XOR(h8_hi, v7_hi, vF_hi) % 2^32
  3867. end
  3868. H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3869. H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3870. return bytes_compressed
  3871. end
  -- Runs the 7-round compression loop below over consecutive 64-byte blocks and stores the result in H_out.
  --   str          - string holding the blocks; when nil/false, the 16 words already present in common_W are used unchanged
  --   offs, size   - zero-based start offset and total byte count to process (one loop iteration per 64 bytes)
  --   flags        - value placed into state word vF
  --   chunk_index  - counter; split into low/high 32-bit halves (state words vC and vD)
  --   H_in         - array whose elements 1..8 give the input chaining value
  --   H_out        - (optional) destination array, defaults to H_in
  --   wide_output  - when truthy, elements 9..16 of H_out are also written
  --   block_length - (optional) value placed into state word vE, defaults to 64
  -- Returns nothing; the result is delivered through H_out.
  function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
     -- offs >= 0, size >= 0, size is multiple of 64
     block_length = block_length or 64
     local W = common_W
     local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
     H_out = H_out or H_in
     for pos = offs, offs + size - 1, 64 do
        if str then
           -- Load the block as 16 little-endian 32-bit words.
           -- "pos" is advanced here only as a read cursor; NOTE(review): this relies on the numeric "for" keeping its own counter - confirm for the targeted Lua versions
           for j = 1, 16 do
              pos = pos + 4
              local a, b, c, d = byte(str, pos - 3, pos)
              W[j] = ((d * 256 + c) * 256 + b) * 256 + a
           end
        end
        -- 16-word working state: chaining value, four constants taken from sha2_H_hi, counter halves, block length, flags
        local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
        local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
        local vC = chunk_index % 2^32             -- t0 = low_4_bytes(chunk_index)
        local vD = (chunk_index - vC) / 2^32      -- t1 = high_4_bytes(chunk_index)
        local vE, vF = block_length, flags
        -- Idiom used throughout the rounds:
        --    x = XOR(a, b) % 2^32 / 2^k      -- the k low bits become the fractional part
        --    x = x % 1 * (2^32 - 1) + x      -- the fractional part is moved to the top: 32-bit rotate right by k
        -- The additions are not reduced modulo 2^32 on the spot; presumably XOR() tolerates operands wider than 32 bits, since only its result is reduced - TODO confirm
        -- Message words are selected through the table perm_blake3 (defined elsewhere in the file)
        for j = 1, 7 do
           -- column step on (v0, v4, v8, vC)
           v0 = v0 + v4 + W[perm_blake3[j]]
           vC = XOR(vC, v0) % 2^32 / 2^16
           vC = vC % 1 * (2^32 - 1) + vC
           v8 = v8 + vC
           v4 = XOR(v4, v8) % 2^32 / 2^12
           v4 = v4 % 1 * (2^32 - 1) + v4
           v0 = v0 + v4 + W[perm_blake3[j + 14]]
           vC = XOR(vC, v0) % 2^32 / 2^8
           vC = vC % 1 * (2^32 - 1) + vC
           v8 = v8 + vC
           v4 = XOR(v4, v8) % 2^32 / 2^7
           v4 = v4 % 1 * (2^32 - 1) + v4
           -- column step on (v1, v5, v9, vD)
           v1 = v1 + v5 + W[perm_blake3[j + 1]]
           vD = XOR(vD, v1) % 2^32 / 2^16
           vD = vD % 1 * (2^32 - 1) + vD
           v9 = v9 + vD
           v5 = XOR(v5, v9) % 2^32 / 2^12
           v5 = v5 % 1 * (2^32 - 1) + v5
           v1 = v1 + v5 + W[perm_blake3[j + 2]]
           vD = XOR(vD, v1) % 2^32 / 2^8
           vD = vD % 1 * (2^32 - 1) + vD
           v9 = v9 + vD
           v5 = XOR(v5, v9) % 2^32 / 2^7
           v5 = v5 % 1 * (2^32 - 1) + v5
           -- column step on (v2, v6, vA, vE)
           v2 = v2 + v6 + W[perm_blake3[j + 16]]
           vE = XOR(vE, v2) % 2^32 / 2^16
           vE = vE % 1 * (2^32 - 1) + vE
           vA = vA + vE
           v6 = XOR(v6, vA) % 2^32 / 2^12
           v6 = v6 % 1 * (2^32 - 1) + v6
           v2 = v2 + v6 + W[perm_blake3[j + 7]]
           vE = XOR(vE, v2) % 2^32 / 2^8
           vE = vE % 1 * (2^32 - 1) + vE
           vA = vA + vE
           v6 = XOR(v6, vA) % 2^32 / 2^7
           v6 = v6 % 1 * (2^32 - 1) + v6
           -- column step on (v3, v7, vB, vF)
           v3 = v3 + v7 + W[perm_blake3[j + 15]]
           vF = XOR(vF, v3) % 2^32 / 2^16
           vF = vF % 1 * (2^32 - 1) + vF
           vB = vB + vF
           v7 = XOR(v7, vB) % 2^32 / 2^12
           v7 = v7 % 1 * (2^32 - 1) + v7
           v3 = v3 + v7 + W[perm_blake3[j + 17]]
           vF = XOR(vF, v3) % 2^32 / 2^8
           vF = vF % 1 * (2^32 - 1) + vF
           vB = vB + vF
           v7 = XOR(v7, vB) % 2^32 / 2^7
           v7 = v7 % 1 * (2^32 - 1) + v7
           -- diagonal step on (v0, v5, vA, vF)
           v0 = v0 + v5 + W[perm_blake3[j + 21]]
           vF = XOR(vF, v0) % 2^32 / 2^16
           vF = vF % 1 * (2^32 - 1) + vF
           vA = vA + vF
           v5 = XOR(v5, vA) % 2^32 / 2^12
           v5 = v5 % 1 * (2^32 - 1) + v5
           v0 = v0 + v5 + W[perm_blake3[j + 5]]
           vF = XOR(vF, v0) % 2^32 / 2^8
           vF = vF % 1 * (2^32 - 1) + vF
           vA = vA + vF
           v5 = XOR(v5, vA) % 2^32 / 2^7
           v5 = v5 % 1 * (2^32 - 1) + v5
           -- diagonal step on (v1, v6, vB, vC)
           v1 = v1 + v6 + W[perm_blake3[j + 3]]
           vC = XOR(vC, v1) % 2^32 / 2^16
           vC = vC % 1 * (2^32 - 1) + vC
           vB = vB + vC
           v6 = XOR(v6, vB) % 2^32 / 2^12
           v6 = v6 % 1 * (2^32 - 1) + v6
           v1 = v1 + v6 + W[perm_blake3[j + 6]]
           vC = XOR(vC, v1) % 2^32 / 2^8
           vC = vC % 1 * (2^32 - 1) + vC
           vB = vB + vC
           v6 = XOR(v6, vB) % 2^32 / 2^7
           v6 = v6 % 1 * (2^32 - 1) + v6
           -- diagonal step on (v2, v7, v8, vD)
           v2 = v2 + v7 + W[perm_blake3[j + 4]]
           vD = XOR(vD, v2) % 2^32 / 2^16
           vD = vD % 1 * (2^32 - 1) + vD
           v8 = v8 + vD
           v7 = XOR(v7, v8) % 2^32 / 2^12
           v7 = v7 % 1 * (2^32 - 1) + v7
           v2 = v2 + v7 + W[perm_blake3[j + 18]]
           vD = XOR(vD, v2) % 2^32 / 2^8
           vD = vD % 1 * (2^32 - 1) + vD
           v8 = v8 + vD
           v7 = XOR(v7, v8) % 2^32 / 2^7
           v7 = v7 % 1 * (2^32 - 1) + v7
           -- diagonal step on (v3, v4, v9, vE)
           v3 = v3 + v4 + W[perm_blake3[j + 19]]
           vE = XOR(vE, v3) % 2^32 / 2^16
           vE = vE % 1 * (2^32 - 1) + vE
           v9 = v9 + vE
           v4 = XOR(v4, v9) % 2^32 / 2^12
           v4 = v4 % 1 * (2^32 - 1) + v4
           v3 = v3 + v4 + W[perm_blake3[j + 20]]
           vE = XOR(vE, v3) % 2^32 / 2^8
           vE = vE % 1 * (2^32 - 1) + vE
           v9 = v9 + vE
           v4 = XOR(v4, v9) % 2^32 / 2^7
           v4 = v4 % 1 * (2^32 - 1) + v4
        end
        if wide_output then
           -- Upper half of the extended output: the chaining value as it was BEFORE this block, XORed with v8..vF
           -- (written on every iteration, so after a multi-block call only the last block's values remain)
           H_out[ 9] = XOR(h1, v8)
           H_out[10] = XOR(h2, v9)
           H_out[11] = XOR(h3, vA)
           H_out[12] = XOR(h4, vB)
           H_out[13] = XOR(h5, vC)
           H_out[14] = XOR(h6, vD)
           H_out[15] = XOR(h7, vE)
           H_out[16] = XOR(h8, vF)
        end
        -- New chaining value: lower state half XORed with upper state half
        h1 = XOR(v0, v8)
        h2 = XOR(v1, v9)
        h3 = XOR(v2, vA)
        h4 = XOR(v3, vB)
        h5 = XOR(v4, vC)
        h6 = XOR(v5, vD)
        h7 = XOR(v6, vE)
        h8 = XOR(v7, vF)
     end
     -- Publish the chaining value once, after all blocks have been processed
     H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
  end
  4010. end
  4011. --------------------------------------------------------------------------------
  4012. -- MAGIC NUMBERS CALCULATOR
  4013. --------------------------------------------------------------------------------
  4014. -- Q:
  4015. -- Is 53-bit "double" math enough to calculate square roots and cube roots of primes with 64 correct bits after decimal point?
  4016. -- A:
  4017. -- Yes, 53-bit "double" arithmetic is enough.
  4018. -- We could obtain first 40 bits by direct calculation of p^(1/3) and next 40 bits by one step of Newton's method.
  -- Fills the SHA-2 constant tables at load time from the fractional parts of cube roots
  -- (sha2_K_hi/sha2_K_lo, 80 entries) and square roots (initial hash values) of consecutive primes.
  do
     -- Long multiplication helper used to get more precision than a single "double" provides.
     local function mul(src1, src2, factor, result_length)
        -- src1, src2  -  long integers (arrays of digits in base 2^24)
        -- factor      -  small integer
        -- returns long integer result (src1 * src2 * factor) and its floating point approximation
        -- Only the "result_length" lowest digits are produced.
        -- With a negative factor the digits are still non-negative because Lua's "%" yields a non-negative remainder for a positive divisor,
        -- i.e. the result is the product reduced modulo 2^(24*result_length)
        local result, carry, value, weight = {}, 0.0, 0.0, 1.0
        for j = 1, result_length do
           -- accumulate all digit products contributing to digit j
           for k = math_max(1, j + 1 - #src2), math_min(j, #src1) do
              carry = carry + factor * src1[k] * src2[j + 1 - k]   -- "int32" is not enough for multiplication result, that's why "factor" must be of type "double"
           end
           local digit = carry % 2^24
           result[j] = floor(digit)
           carry = (carry - digit) / 2^24
           value = value + digit * weight
           weight = weight * 2^24
        end
        return result, value
      end
     -- "step" drives a walk over 2, 3, 5 and then the numbers not divisible by 2 or 3 (p starts at 4 and step[4] = -2 leads to 2).
     -- sqrt_hi/sqrt_lo point at the tables that receive the square-root constants; they are switched after 8 primes (see below).
     local idx, step, p, one, sqrt_hi, sqrt_lo = 0, {4, 1, 2, -2, 2}, 4, {1}, sha2_H_hi, sha2_H_lo
     repeat
        p = p + step[p % 6]
        -- trial division by d = 5, 7, 11, 13, ...
        local d = 1
        repeat
           d = d + step[d % 6]
           if d*d > p then -- next prime number is found
              -- Cube root: R = floor(root * 2^40) stored as two base-2^24 digits
              local root = p^(1/3)
              local R = root * 2^40
              R = mul({R - R % 1}, one, 1.0, 2)
              -- delta = -(R^3) modulo 2^96; this appears to be the residual p*2^120 - R^3 used for the single Newton step mentioned in the header comment
              local _, delta = mul(R, mul(R, R, 1.0, 4), -1.0, 4)
              -- hi = the upper 32 of the 40 fractional bits held in R
              local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
              -- lo = the remaining 8 fractional bits of R followed by 24 bits derived from delta
              local lo = R[1] % 256 * 16777216 + floor(delta * (2^-56 / 3) * root / p)
              if idx < 16 then
                 -- The first 16 primes additionally provide square-root constants.
                 -- "hi", "lo" and "idx" declared inside this branch deliberately shadow the outer ones, which keep their cube-root values.
                 root = p^(1/2)
                 R = root * 2^40
                 R = mul({R - R % 1}, one, 1.0, 2)
                 _, delta = mul(R, R, -1.0, 2)
                 local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
                 local lo = R[1] % 256 * 16777216 + floor(delta * 2^-17 / root)
                 local idx = idx % 8 + 1
                 -- written for primes 1..8 and then overwritten for primes 9..16, so the 9th..16th primes' values remain
                 sha2_H_ext256[224][idx] = lo
                 sqrt_hi[idx], sqrt_lo[idx] = hi, lo + hi * hi_factor
                 if idx > 7 then
                    -- after the 8th prime the next eight results go to the tables stored under key 384
                    sqrt_hi, sqrt_lo = sha2_H_ext512_hi[384], sha2_H_ext512_lo[384]
                 end
              end
              idx = idx + 1
              -- hi_factor and K_lo_modulo are defined elsewhere in the file (presumably branch-dependent - TODO confirm)
              sha2_K_hi[idx], sha2_K_lo[idx] = hi, lo % K_lo_modulo + hi * hi_factor
              break
           end
        until p % d == 0   -- a divisor was found: p is composite, try the next candidate
     until idx > 79        -- stop after 80 primes
  end
  -- Initial values for SHA-512/224 and SHA-512/256:
  -- start from the SHA-512 initial values transformed by XORA5, then compress one ready-made block
  -- that holds the name "SHA-512/<width>", the 0x80 marker, zero filling and the final length byte 88.
  for width = 224, 256, 32 do
     local split_halves = not HEX64   -- without HEX64 the high and low halves live in separate arrays
     local iv_lo = {}
     local iv_hi = split_halves and {} or nil
     for j = 1, 8 do
        iv_lo[j] = XORA5(sha2_H_lo[j])
        if split_halves then
           iv_hi[j] = XORA5(sha2_H_hi[j])
        end
     end
     local algorithm_name = "SHA-512/"..tostring(width)
     local padded_block = algorithm_name.."\128"..string_rep("\0", 115).."\88"
     sha512_feed_128(iv_lo, iv_hi, padded_block, 0, 128)
     sha2_H_ext512_lo[width] = iv_lo
     sha2_H_ext512_hi[width] = iv_hi
  end
  -- MD5 table: md5_K[idx] holds abs(sin(idx)) scaled by 2^32 and truncated, for idx = 1..64
  do
     local math_sin, math_abs, math_modf = math.sin, math.abs, math.modf
     for idx = 1, 64 do
        -- The value is assembled from two 16-bit halves:
        -- the direct formula floor(abs(sin(idx))*2^32) may leave the integer range on Lua built with 32-bit integers
        local scaled = math_abs(math_sin(idx)) * 2^16
        local upper_half, fraction = math_modf(scaled)
        local lower_half = floor(fraction * 2^16)
        md5_K[idx] = upper_half * 65536 + lower_half
     end
  end
  -- SHA-3 round constants: 24 values whose bits come from a one-byte shift register
  do
     local lfsr_state = 29
     -- Emits the lowest bit of the register and advances it:
     -- shift right by one, then XOR with 142 whenever the emitted bit was 1
     local function lfsr_step()
        local out_bit = lfsr_state % 2
        local shifted = (lfsr_state - out_bit) / 2
        lfsr_state = XOR_BYTE(shifted, 142 * out_bit)
        return out_bit
     end
     for round = 1, 24 do
        local low_word, weight = 0, nil
        -- six bits land at weights 1, 2, 8, 128, 32768, 2^31 of the low word
        for _ = 1, 6 do
           if weight then
              weight = weight * weight * 2
           else
              weight = 1
           end
           low_word = low_word + lfsr_step() * weight
        end
        -- the seventh bit reuses the last weight and becomes the high word
        local high_word = lfsr_step() * weight
        sha3_RC_hi[round] = high_word
        sha3_RC_lo[round] = low_word + high_word * hi_factor_keccak
     end
  end
  -- FFI branch only: replace the Lua constant tables with C arrays
  if branch == "FFI" then
     -- Allocates a C array one element longer than "values" and initializes it with 0 followed by the Lua values,
     -- so that the existing 1-based indices keep addressing the same constants
     local function to_c_array(c_type, values)
        return ffi.new(c_type, #values + 1, 0, unpack(values))
     end
     sha2_K_hi = to_c_array("uint32_t[?]", sha2_K_hi)
     sha2_K_lo = to_c_array("int64_t[?]", sha2_K_lo)
     --md5_K = ffi.new("uint32_t[?]", #md5_K + 1, 0, unpack(md5_K))
     if hi_factor_keccak ~= 0 then
        -- high and low halves are combined in one 64-bit value, only the "lo" array is converted
        sha3_RC_lo = to_c_array("int64_t[?]", sha3_RC_lo)
     else
        sha3_RC_lo = to_c_array("uint32_t[?]", sha3_RC_lo)
        sha3_RC_hi = to_c_array("uint32_t[?]", sha3_RC_hi)
     end
  end
  4127. --------------------------------------------------------------------------------
  4128. -- MAIN FUNCTIONS
  4129. --------------------------------------------------------------------------------
  -- Common implementation of the 32-bit SHA-2 functions.
  --   width   - digest width in bits; selects the initial values sha2_H_ext256[width] and the number of output registers
  --   message - (optional) string to hash in one call
  -- Returns the hexadecimal digest when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the digest.
  local function sha256ext(width, message)
     -- Private state of this calculation: registers, number of bytes received, unprocessed remainder
     local H, length, tail = {unpack(sha2_H_ext256[width])}, 0.0, ""

     local function partial(message_part)
        if message_part then
           if not tail then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           local part_size = #message_part
           length = length + part_size
           local consumed = 0
           if tail ~= "" and #tail + part_size >= 64 then
              -- fill the pending block up to 64 bytes and compress it
              consumed = 64 - #tail
              sha256_feed_64(H, tail..sub(message_part, 1, consumed), 0, 64)
              tail = ""
           end
           -- compress all whole blocks directly from the chunk, keep the rest for later
           local leftover = (part_size - consumed) % 64
           sha256_feed_64(H, message_part, consumed, part_size - consumed - leftover)
           tail = tail..sub(message_part, part_size + 1 - leftover)
           return partial
        end
        if tail then
           -- First request for the digest: pad, compress, convert to hex
           local pieces = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)}
           tail = nil
           -- The byte counter is assumed to stay below 2^53 - 9, so the bit counter fits in 7 bytes.
           -- Scale the bit counter into [0, 1) and peel off its bytes starting from the most significant one
           local scaled = length * (8 / 256^7)
           for slot = 4, 10 do
              scaled = scaled % 1 * 256
              pieces[slot] = char(floor(scaled))
           end
           local padded = table_concat(pieces)
           sha256_feed_64(H, padded, 0, #padded)
           local max_reg = width / 32
           for j = 1, max_reg do
              H[j] = HEX(H[j])
           end
           H = table_concat(H, "", 1, max_reg)
        end
        return H
     end

     if not message then
        -- chunk-by-chunk mode: hand the feeding function to the user
        return partial
     end
     -- one-shot mode: feed the whole message and finalize
     return partial(message)()
  end
  -- Common implementation of the 64-bit SHA-2 functions.
  --   width   - digest width in bits; selects the initial values sha2_H_ext512_lo/hi[width] and the output length
  --   message - (optional) string to hash in one call
  -- Returns the hexadecimal digest when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the digest.
  local function sha512ext(width, message)
     -- Private state of this calculation; H_hi is false when HEX64 is available
     local length, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_ext512_lo[width])}, not HEX64 and {unpack(sha2_H_ext512_hi[width])}

     local function partial(message_part)
        if message_part then
           if not tail then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           local part_size = #message_part
           length = length + part_size
           local consumed = 0
           if tail ~= "" and #tail + part_size >= 128 then
              -- fill the pending block up to 128 bytes and compress it
              consumed = 128 - #tail
              sha512_feed_128(H_lo, H_hi, tail..sub(message_part, 1, consumed), 0, 128)
              tail = ""
           end
           -- compress all whole blocks directly from the chunk, keep the rest for later
           local leftover = (part_size - consumed) % 128
           sha512_feed_128(H_lo, H_hi, message_part, consumed, part_size - consumed - leftover)
           tail = tail..sub(message_part, part_size + 1 - leftover)
           return partial
        end
        if tail then
           -- First request for the digest: pad, compress, convert to hex
           local pieces = {tail, "\128", string_rep("\0", (-17-length) % 128 + 9)}
           tail = nil
           -- The byte counter is assumed to stay below 2^53 - 17, so the bit counter fits in 7 bytes.
           -- Scale the bit counter into [0, 1) and peel off its bytes starting from the most significant one
           local scaled = length * (8 / 256^7)
           for slot = 4, 10 do
              scaled = scaled % 1 * 256
              pieces[slot] = char(floor(scaled))
           end
           local padded = table_concat(pieces)
           sha512_feed_128(H_lo, H_hi, padded, 0, #padded)
           local max_reg = ceil(width / 64)
           if HEX64 then
              for j = 1, max_reg do
                 H_lo[j] = HEX64(H_lo[j])
              end
           else
              for j = 1, max_reg do
                 H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j])
              end
              H_hi = nil
           end
           -- the last register may be needed only partially, hence the cut to width/4 hex digits
           H_lo = sub(table_concat(H_lo, "", 1, max_reg), 1, width / 4)
        end
        return H_lo
     end

     if not message then
        -- chunk-by-chunk mode: hand the feeding function to the user
        return partial
     end
     -- one-shot mode: feed the whole message and finalize
     return partial(message)()
  end
  -- MD5 digest.
  --   message - (optional) string to hash in one call
  -- Returns the hexadecimal digest when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the digest.
  local function md5(message)
     -- Private state of this calculation: four registers, number of bytes received, unprocessed remainder
     local H, length, tail = {unpack(md5_sha1_H, 1, 4)}, 0.0, ""

     local function partial(message_part)
        if message_part then
           if not tail then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           local part_size = #message_part
           length = length + part_size
           local consumed = 0
           if tail ~= "" and #tail + part_size >= 64 then
              -- fill the pending block up to 64 bytes and compress it
              consumed = 64 - #tail
              md5_feed_64(H, tail..sub(message_part, 1, consumed), 0, 64)
              tail = ""
           end
           -- compress all whole blocks directly from the chunk, keep the rest for later
           local leftover = (part_size - consumed) % 64
           md5_feed_64(H, message_part, consumed, part_size - consumed - leftover)
           tail = tail..sub(message_part, part_size + 1 - leftover)
           return partial
        end
        if tail then
           -- First request for the digest: pad, compress, convert to hex
           local pieces = {tail, "\128", string_rep("\0", (-9 - length) % 64)}
           tail = nil
           -- append the bit counter as 8 bytes, least significant byte first
           local bit_count = length * 8
           for slot = 4, 11 do
              local low_byte = bit_count % 256
              pieces[slot] = char(low_byte)
              bit_count = (bit_count - low_byte) / 256
           end
           local padded = table_concat(pieces)
           md5_feed_64(H, padded, 0, #padded)
           for j = 1, 4 do
              H[j] = HEX(H[j])
           end
           -- reverse the byte order inside every 8-digit group
           H = gsub(table_concat(H), "(..)(..)(..)(..)", "%4%3%2%1")
        end
        return H
     end

     if not message then
        -- chunk-by-chunk mode: hand the feeding function to the user
        return partial
     end
     -- one-shot mode: feed the whole message and finalize
     return partial(message)()
  end
  -- SHA-1 digest.
  --   message - (optional) string to hash in one call
  -- Returns the hexadecimal digest when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the digest.
  local function sha1(message)
     -- Private state of this calculation: registers, number of bytes received, unprocessed remainder
     local H, length, tail = {unpack(md5_sha1_H)}, 0.0, ""

     local function partial(message_part)
        if message_part then
           if not tail then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           local part_size = #message_part
           length = length + part_size
           local consumed = 0
           if tail ~= "" and #tail + part_size >= 64 then
              -- fill the pending block up to 64 bytes and compress it
              consumed = 64 - #tail
              sha1_feed_64(H, tail..sub(message_part, 1, consumed), 0, 64)
              tail = ""
           end
           -- compress all whole blocks directly from the chunk, keep the rest for later
           local leftover = (part_size - consumed) % 64
           sha1_feed_64(H, message_part, consumed, part_size - consumed - leftover)
           tail = tail..sub(message_part, part_size + 1 - leftover)
           return partial
        end
        if tail then
           -- First request for the digest: pad, compress, convert to hex
           local pieces = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)}
           tail = nil
           -- The byte counter is assumed to stay below 2^53 - 9, so the bit counter fits in 7 bytes.
           -- Scale the bit counter into [0, 1) and peel off its bytes starting from the most significant one
           local scaled = length * (8 / 256^7)
           for slot = 4, 10 do
              scaled = scaled % 1 * 256
              pieces[slot] = char(floor(scaled))
           end
           local padded = table_concat(pieces)
           sha1_feed_64(H, padded, 0, #padded)
           for j = 1, 5 do
              H[j] = HEX(H[j])
           end
           H = table_concat(H)
        end
        return H
     end

     if not message then
        -- chunk-by-chunk mode: hand the feeding function to the user
        return partial
     end
     -- one-shot mode: feed the whole message and finalize
     return partial(message)()
  end
  -- Common implementation behind the SHA-3 and SHAKE functions.
  --   block_size_in_bytes  - number of bytes absorbed per call of the permutation (multiple of 8)
  --   digest_size_in_bytes - requested digest length; a negative value requests a function that produces the digest piece by piece
  --   is_SHAKE             - selects the SHAKE padding byte instead of the usual SHA-3 one
  --   message              - (optional) string to hash in one call
  -- Returns the result when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the result.
  -- Raises an error when "digest_size_in_bytes" is not a number or when data is appended after the result was requested.
  local function keccak(block_size_in_bytes, digest_size_in_bytes, is_SHAKE, message)
     -- "block_size_in_bytes" is multiple of 8
     if type(digest_size_in_bytes) ~= "number" then
        -- arguments in SHAKE are swapped:
        --    NIST FIPS 202 defines SHAKE(message,num_bits)
        --    this module   defines SHAKE(num_bytes,message)
        -- it's easy to forget about this swap, hence the check
        error("Argument 'digest_size_in_bytes' must be a number", 2)
     end
     -- Create an instance (private objects for current calculation)
     -- lanes_hi is a second array only when hi_factor_keccak == 0, otherwise it is false
     local tail, lanes_lo, lanes_hi = "", create_array_of_lanes(), hi_factor_keccak == 0 and create_array_of_lanes()
     local result
     local function partial(message_part)
        if message_part then
           if tail then
              local offs = 0
              if tail ~= "" and #tail + #message_part >= block_size_in_bytes then
                 -- complete the pending block with the beginning of the new chunk
                 offs = block_size_in_bytes - #tail
                 keccak_feed(lanes_lo, lanes_hi, tail..sub(message_part, 1, offs), 0, block_size_in_bytes, block_size_in_bytes)
                 tail = ""
              end
              -- absorb all whole blocks directly from the chunk, keep the rest for later
              local size = #message_part - offs
              local size_tail = size % block_size_in_bytes
              keccak_feed(lanes_lo, lanes_hi, message_part, offs, size - size_tail, block_size_in_bytes)
              tail = tail..sub(message_part, #message_part + 1 - size_tail)
              return partial
           else
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
        else
           if tail then
              -- append the following bits to the message: for usual SHA-3: 011(0*)1, for SHAKE: 11111(0*)1
              local gap_start = is_SHAKE and 31 or 6
              -- when exactly one byte is free in the block, the first and the last padding bits share that byte
              tail = tail..(#tail + 1 == block_size_in_bytes and char(gap_start + 128) or char(gap_start)..string_rep("\0", (-2 - #tail) % block_size_in_bytes).."\128")
              keccak_feed(lanes_lo, lanes_hi, tail, 0, #tail, block_size_in_bytes)
              tail = nil   -- marks the instance as finalized
              -- State of the output phase, shared by the two helpers below
              local lanes_used = 0
              local total_lanes = floor(block_size_in_bytes / 8)
              local qwords = {}
              local function get_next_qwords_of_digest(qwords_qty)
                 -- returns not more than 'qwords_qty' qwords ('qwords_qty' might be non-integer)
                 -- doesn't go across keccak-buffer boundary
                 -- block_size_in_bytes is a multiple of 8, so, keccak-buffer contains integer number of qwords
                 -- second return value: number of bytes represented by the returned hex string
                 if lanes_used >= total_lanes then
                    -- all lanes of the current buffer were consumed: feed 8 zero bytes to obtain the next buffer
                    -- (presumably XORing zeroes leaves the state intact and only the permutation takes effect - TODO confirm in keccak_feed)
                    keccak_feed(lanes_lo, lanes_hi, "\0\0\0\0\0\0\0\0", 0, 8, 8)
                    lanes_used = 0
                 end
                 qwords_qty = floor(math_min(qwords_qty, total_lanes - lanes_used))
                 if hi_factor_keccak ~= 0 then
                    for j = 1, qwords_qty do
                       qwords[j] = HEX64(lanes_lo[lanes_used + j - 1 + lanes_index_base])
                    end
                 else
                    for j = 1, qwords_qty do
                       qwords[j] = HEX(lanes_hi[lanes_used + j])..HEX(lanes_lo[lanes_used + j])
                    end
                 end
                 lanes_used = lanes_used + qwords_qty
                 -- reverse the order of the 8 bytes inside every 16-digit group
                 return
                    gsub(table_concat(qwords, "", 1, qwords_qty), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"),
                    qwords_qty * 8
              end
              local parts = {}      -- digest parts
              -- hex digits already generated but not yet handed out, and the number of bytes they represent
              local last_part, last_part_size = "", 0
              local function get_next_part_of_digest(bytes_needed)
                 -- returns 'bytes_needed' bytes, for arbitrary integer 'bytes_needed'
                 bytes_needed = bytes_needed or 1
                 if bytes_needed <= last_part_size then
                    -- the request is fully covered by the leftover of the previous call
                    last_part_size = last_part_size - bytes_needed
                    local part_size_in_nibbles = bytes_needed * 2
                    local result = sub(last_part, 1, part_size_in_nibbles)
                    last_part = sub(last_part, part_size_in_nibbles + 1)
                    return result
                 end
                 local parts_qty = 0
                 if last_part_size > 0 then
                    -- start with whatever is left from the previous call
                    parts_qty = 1
                    parts[parts_qty] = last_part
                    bytes_needed = bytes_needed - last_part_size
                 end
                 -- repeats until the length is enough
                 while bytes_needed >= 8 do
                    local next_part, next_part_size = get_next_qwords_of_digest(bytes_needed / 8)
                    parts_qty = parts_qty + 1
                    parts[parts_qty] = next_part
                    bytes_needed = bytes_needed - next_part_size
                 end
                 if bytes_needed > 0 then
                    -- fewer than 8 bytes are still missing: fetch one whole qword as the new leftover
                    -- and cut the required bytes from it through the fast path at the top of this function
                    last_part, last_part_size = get_next_qwords_of_digest(1)
                    parts_qty = parts_qty + 1
                    parts[parts_qty] = get_next_part_of_digest(bytes_needed)
                 else
                    last_part, last_part_size = "", 0
                 end
                 return table_concat(parts, "", 1, parts_qty)
              end
              if digest_size_in_bytes < 0 then
                 -- negative size: the result is the generator itself, the caller pulls as many bytes as needed
                 result = get_next_part_of_digest
              else
                 result = get_next_part_of_digest(digest_size_in_bytes)
              end
           end
           -- repeated calls without an argument return the same result
           return result
        end
     end
     if message then
        -- Actually perform calculations and return the SHA-3 digest of a message
        return partial(message)()
     else
        -- Return function for chunk-by-chunk loading
        -- User should feed every chunk of input data as single argument to this function and finally get SHA-3 digest by invoking this function without an argument
        return partial
     end
  end
  4457. local hex_to_bin, bin_to_hex, bin_to_base64, base64_to_bin
  -- Converters between binary strings and their hexadecimal / Base64 text forms
  do
     -- Replaces every pair of hexadecimal digits by the byte it denotes.
     -- Characters that are not part of such a pair are left in the string unchanged.
     function hex_to_bin(hex_string)
        return (gsub(hex_string, "%x%x",
           function (hh)
              return char(tonumber(hh, 16))
           end
        ))
     end

     -- Returns the lowercase hexadecimal representation (two digits per byte) of a binary string.
     function bin_to_hex(binary_string)
        return (gsub(binary_string, ".",
           function (c)
              return string_format("%02x", byte(c))
           end
        ))
     end

     -- Two-way lookup table: symbol -> 6-bit code and 6-bit code -> symbol.
     -- Decoding accepts both the standard ('+', '/', '=') and the alternative ('-', '_', '.') symbols;
     -- encoding always emits the standard ones. Padding is represented by the code -1.
     local base64_symbols = {
        ['+'] = 62, ['-'] = 62,  [62] = '+',
        ['/'] = 63, ['_'] = 63,  [63] = '/',
        ['='] = -1, ['.'] = -1,  [-1] = '='
     }
     local symbol_index = 0
     for _, pair in ipairs{'AZ', 'az', '09'} do
        for ascii = byte(pair), byte(pair, 2) do
           local ch = char(ascii)
           base64_symbols[ch] = symbol_index
           base64_symbols[symbol_index] = ch
           symbol_index = symbol_index + 1
        end
     end

     -- Encodes a binary string as padded Base64 text (standard alphabet, no line breaks).
     function bin_to_base64(binary_string)
        local result = {}
        for pos = 1, #binary_string, 3 do
           -- the appended "\0" guarantees that c2 exists; c3 and c4 are nil when the group is incomplete
           local c1, c2, c3, c4 = byte(sub(binary_string, pos, pos + 2)..'\0', 1, -1)
           result[#result + 1] =
              base64_symbols[floor(c1 / 4)]
              ..base64_symbols[c1 % 4 * 16 + floor(c2 / 16)]
              ..base64_symbols[c3 and c2 % 16 * 4 + floor(c3 / 64) or -1]
              ..base64_symbols[c4 and c3 % 64 or -1]
        end
        return table_concat(result)
     end

     -- Decodes Base64 text into a binary string.
     -- Whitespace is ignored. The final group may be given without padding (2 or 3 symbols);
     -- a single dangling symbol cannot encode a byte and is ignored.
     -- Raises an error when the text contains a character that is not a Base64 symbol.
     function base64_to_bin(base64_string)
        local result, result_qty = {}, 0
        local s1, s2, s3               -- 6-bit codes collected for the current group
        local filled, pad_qty = 0, 0   -- symbols and padding symbols seen in the current group
        for ch in gmatch(gsub(base64_string, '%s+', ''), '.') do
           local code = base64_symbols[ch]
           if not code then
              error(string_format("Invalid character %q in Base64 string", ch), 2)
           end
           if code < 0 then
              -- padding symbol: contributes zero bits and shortens the decoded group by one byte
              pad_qty = pad_qty + 1
              code = 0
           end
           filled = filled + 1
           if filled == 1 then
              s1 = code
           elseif filled == 2 then
              s2 = code
           elseif filled == 3 then
              s3 = code
           else
              local c1 = s1 * 4 + floor(s2 / 16)
              local c2 = s2 % 16 * 16 + floor(s3 / 4)
              local c3 = s3 % 4 * 64 + code
              local bytes_qty = 3 - pad_qty
              if bytes_qty < 0 then
                 bytes_qty = 0   -- a negative length would make sub() count from the end of the string
              end
              result_qty = result_qty + 1
              result[result_qty] = sub(char(c1, c2, c3), 1, bytes_qty)
              -- padding is counted per group, so concatenated padded strings decode correctly
              filled, pad_qty = 0, 0
           end
        end
        if filled >= 2 then
           -- final group without (complete) padding: n symbols carry n-1 bytes, minus one per padding symbol
           local bytes_qty = filled - 1 - pad_qty
           if bytes_qty > 0 then
              if filled < 3 then
                 s3 = 0
              end
              local c1 = s1 * 4 + floor(s2 / 16)
              local c2 = s2 % 16 * 16 + floor(s3 / 4)
              result_qty = result_qty + 1
              result[result_qty] = sub(char(c1, c2), 1, bytes_qty)
           end
        end
        return table_concat(result)
     end
  end
  4520. local block_size_for_HMAC -- this table will be initialized at the end of the module
  -- Returns "str" with every byte XORed with "byte_for_xor",
  -- extended on the right with bytes equal to "byte_for_xor" up to "result_length" bytes
  -- (which equals XORing a zero-padded string)
  local function pad_and_xor(str, result_length, byte_for_xor)
     local function xor_one_char(c)
        return char(XOR_BYTE(byte(c), byte_for_xor))
     end
     local xored = gsub(str, ".", xor_one_char)
     local filler = string_rep(char(byte_for_xor), result_length - #str)
     return xored..filler
  end
  -- HMAC built on one of the hash functions registered in block_size_for_HMAC.
  --   hash_func - hash function of this module (used as the key of block_size_for_HMAC)
  --   key       - binary string
  --   message   - (optional) string to authenticate in one call
  -- Returns the HMAC when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the HMAC.
  local function hmac(hash_func, key, message)
     local block_size = block_size_for_HMAC[hash_func]
     if not block_size then
        error("Unknown hash function", 2)
     end
     if #key > block_size then
        -- a key longer than one block is replaced by its digest
        key = hex_to_bin(hash_func(key))
     end
     -- Inner hash: starts with the key XORed with 0x36, the message chunks follow
     local inner_hash = hash_func()
     local append = inner_hash(pad_and_xor(key, block_size, 0x36))
     local result

     local function partial(message_part)
        if message_part then
           if result then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           append(message_part)
           return partial
        end
        if not result then
           -- Outer hash: the key XORed with 0x5C followed by the binary digest of the inner hash
           local outer_prefix = pad_and_xor(key, block_size, 0x5C)
           local inner_digest = hex_to_bin(append())
           result = hash_func(outer_prefix..inner_digest)
        end
        return result
     end

     if not message then
        -- chunk-by-chunk mode: hand the feeding function to the user
        return partial
     end
     -- one-shot mode: feed the whole message and finalize
     return partial(message)()
  end
  -- Validates the BLAKE2 salt length and, when H_lo is given, XORs the salt into the hash state starting at word 5.
  --   salt       - concatenation of the "Salt" and "Personalization" fields
  --   letter     - "s" selects 4-byte words and a 16-byte limit, anything else 8-byte words and a 32-byte limit
  --   H_lo, H_hi - (optional) state arrays; with H_lo omitted only the length check is performed
  local function xor_blake2_salt(salt, letter, H_lo, H_hi)
     local is_short = letter == "s"
     local max_size = is_short and 16 or 32
     local salt_size = #salt
     if salt_size > max_size then
        error(string_format("For BLAKE2%s/BLAKE2%sp/BLAKE2X%s the 'salt' parameter length must not exceed %d bytes", letter, letter, letter, max_size), 2)
     end
     if not H_lo then
        return
     end

     local word_size, mix = 8, XORA5
     if is_short then
        word_size, mix = 4, XOR
     end

     -- Reads the next 4 bytes of the salt as a little-endian number; bytes beyond the end of the salt count as zeroes
     local cursor = 0
     local function next_dword()
        cursor = cursor + 4
        local b0, b1, b2, b3 = byte(salt, cursor - 3, cursor)
        return (((b3 or 0) * 256 + (b2 or 0)) * 256 + (b1 or 0)) * 256 + (b0 or 0)
     end

     for j = 5, 4 + ceil(salt_size / word_size) do
        local word, high_dword
        if is_short then
           -- 4-byte word: a single dword
           word = next_dword()
           high_dword = word
        else
           -- 8-byte word: low dword first, then high dword
           local low_dword = next_dword()
           high_dword = next_dword()
           word = high_dword * hi_factor + low_dword
        end
        H_lo[j] = mix(H_lo[j], word)
        if H_hi then
           H_hi[j] = mix(H_hi[j], high_dword)
        end
     end
  end
  -- BLAKE2s digest.
  --   message              - binary string to be hashed (or nil for "chunk-by-chunk" input mode)
  --   key                  - (optional) binary string up to 32 bytes, by default empty string
  --   salt                 - (optional) binary string up to 16 bytes, by default empty string
  --   digest_size_in_bytes - (optional) integer from 1 to 32, by default 32
  --   XOF_length, B2_offset - for internal use only, user must omit them (or pass nil)
  -- Returns the hexadecimal digest when "message" is given, otherwise a function for chunk-by-chunk input:
  -- call it with a string to append data (it returns itself), call it without an argument to obtain the digest.
  local function blake2s(message, key, salt, digest_size_in_bytes, XOF_length, B2_offset)
     digest_size_in_bytes = digest_size_in_bytes or 32
     if digest_size_in_bytes < 1 or digest_size_in_bytes > 32 then
        error("BLAKE2s digest length must be from 1 to 32 bytes", 2)
     end
     key = key or ""
     local key_length = #key
     if key_length > 32 then
        error("BLAKE2s key length must not exceed 32 bytes", 2)
     end
     salt = salt or ""

     -- Private state of this calculation: byte counter, unprocessed remainder, hash state
     local bytes_compressed, tail, H = 0.0, "", {unpack(sha2_H_hi)}
     -- Mix the parameters into the initial state
     if B2_offset then
        H[1] = XOR(H[1], digest_size_in_bytes)
        H[2] = XOR(H[2], 0x20)
        H[3] = XOR(H[3], B2_offset)
        H[4] = XOR(H[4], 0x20000000 + XOF_length)
     else
        H[1] = XOR(H[1], 0x01010000 + key_length * 256 + digest_size_in_bytes)
        if XOF_length then
           H[4] = XOR(H[4], XOF_length)
        end
     end
     if salt ~= "" then
        xor_blake2_salt(salt, "s", H)
     end

     local function partial(message_part)
        if message_part then
           if not tail then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           local part_size = #message_part
           local consumed = 0
           if tail ~= "" and #tail + part_size > 64 then
              -- the pending block is known not to be the last one: complete and compress it
              consumed = 64 - #tail
              bytes_compressed = blake2s_feed_64(H, tail..sub(message_part, 1, consumed), 0, 64, bytes_compressed)
              tail = ""
           end
           -- Always hold back 1..64 bytes of a non-empty remainder: the last block is compressed only at finalization
           local remaining = part_size - consumed
           local held_back = 0
           if remaining > 0 then
              held_back = (remaining - 1) % 64 + 1
           end
           bytes_compressed = blake2s_feed_64(H, message_part, consumed, remaining - held_back, bytes_compressed)
           tail = tail..sub(message_part, part_size + 1 - held_back)
           return partial
        end
        if tail then
           -- First request for the digest: compress the final block
           if B2_offset then
              blake2s_feed_64(H, nil, 0, 64, 0, 32)
           else
              blake2s_feed_64(H, tail..string_rep("\0", 64 - #tail), 0, 64, bytes_compressed, #tail)
           end
           tail = nil
           if not XOF_length or B2_offset then
              -- convert the needed registers to hex, reverse the bytes inside every register and cut to the requested length
              local max_reg = ceil(digest_size_in_bytes / 4)
              for j = 1, max_reg do
                 H[j] = HEX(H[j])
              end
              H = sub(gsub(table_concat(H, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, digest_size_in_bytes * 2)
           end
        end
        return H
     end

     if key_length > 0 then
        -- the key, zero-padded to a whole block, is fed before the message
        partial(key..string_rep("\0", 64 - key_length))
     end
     if B2_offset then
        return partial()
     end
     if not message then
        -- chunk-by-chunk mode: hand the feeding function to the user
        return partial
     end
     -- one-shot mode: feed the whole message and finalize
     return partial(message)()
  end
  local function blake2b(message, key, salt, digest_size_in_bytes, XOF_length, B2_offset)
    -- Calculates BLAKE2b digest.
    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
    -- key:      (optional) binary string up to 64 bytes, by default empty string
    -- salt:     (optional) binary string up to 32 bytes, by default empty string
    -- digest_size_in_bytes: (optional) integer from 1 to 64, by default 64
    -- The last two parameters "XOF_length" and "B2_offset" are for internal use only, user must omit them (or pass nil)
    -- Returns:
    --   hexadecimal digest string, when "message" is a string;
    --   loader function "partial", when "message" is nil.
    -- Raises an error (reported at the caller's position) if the digest size or the key length is out of range.
    digest_size_in_bytes = floor(digest_size_in_bytes or 64)
    if digest_size_in_bytes < 1 or digest_size_in_bytes > 64 then
      error("BLAKE2b digest length must be from 1 to 64 bytes", 2)
    end
    key = key or ""
    local key_length = #key
    if key_length > 64 then
      error("BLAKE2b key length must not exceed 64 bytes", 2)
    end
    salt = salt or ""
    -- Two layouts of the state are supported:
    --   HEX64 is defined:     H_lo holds the whole registers, H_hi is false
    --   HEX64 is not defined: every register is split in two arrays (H_hi[j] is printed before H_lo[j], see below)
    -- tail: not yet compressed bytes; nil after the result was produced
    local bytes_compressed, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
    if B2_offset then
      -- internal mode used by blake2x(): one output block of the extendable-output function
      if H_hi then
        H_lo[1] = XORA5(H_lo[1], digest_size_in_bytes)
        H_hi[1] = XORA5(H_hi[1], 0x40)
        H_lo[2] = XORA5(H_lo[2], B2_offset)
        H_hi[2] = XORA5(H_hi[2], XOF_length)
      else
        -- hi_factor moves a value to the upper half of a whole register
        H_lo[1] = XORA5(H_lo[1], 0x40 * hi_factor + digest_size_in_bytes)
        H_lo[2] = XORA5(H_lo[2], XOF_length * hi_factor + B2_offset)
      end
      H_lo[3] = XORA5(H_lo[3], 0x4000)
    else
      -- digest length goes to byte 0 and key length to byte 1 of the first parameter word
      H_lo[1] = XORA5(H_lo[1], 0x01010000 + key_length * 256 + digest_size_in_bytes)
      if XOF_length then
        if H_hi then
          H_hi[2] = XORA5(H_hi[2], XOF_length)
        else
          H_lo[2] = XORA5(H_lo[2], XOF_length * hi_factor)
        end
      end
    end
    if salt ~= "" then
      xor_blake2_salt(salt, "b", H_lo, H_hi)
    end
    local function partial(message_part)
      -- partial(string) feeds the next chunk and returns itself (calls can be chained)
      -- partial()       finishes the calculation and returns the result
      if message_part then
        if tail then
          local offs = 0
          if tail ~= "" and #tail + #message_part > 128 then
            -- complete the pending block with the beginning of the new chunk
            offs = 128 - #tail
            bytes_compressed = blake2b_feed_128(H_lo, H_hi, tail..sub(message_part, 1, offs), 0, 128, bytes_compressed)
            tail = ""
          end
          local size = #message_part - offs
          -- the last 1..128 bytes are always kept in the tail, so the final block is compressed only by partial()
          local size_tail = size > 0 and (size - 1) % 128 + 1 or 0
          bytes_compressed = blake2b_feed_128(H_lo, H_hi, message_part, offs, size - size_tail, bytes_compressed)
          tail = tail..sub(message_part, #message_part + 1 - size_tail)
          return partial
        else
          error("Adding more chunks is not allowed after receiving the result", 2)
        end
      else
        if tail then
          if B2_offset then
            -- no message string is passed here; NOTE(review): the input words presumably come from
            -- the shared buffer "common_W_blake2b" filled by blake2x() before this call - confirm in blake2b_feed_128
            blake2b_feed_128(H_lo, H_hi, nil, 0, 128, 0, 64)
          else
            -- final block: zero-padded tail, its real length is passed as the last argument
            blake2b_feed_128(H_lo, H_hi, tail..string_rep("\0", 128 - #tail), 0, 128, bytes_compressed, #tail)
          end
          tail = nil
          if XOF_length and not B2_offset then
            -- root hash for blake2x(): the raw state is returned instead of a hex string
            if H_hi then
              -- interleave the two halves in place: H_lo becomes {lo1, hi1, lo2, hi2, ...} (16 elements);
              -- descending order of j prevents overwriting of not yet moved elements
              for j = 8, 1, -1 do
                H_lo[j*2] = H_hi[j]
                H_lo[j*2-1] = H_lo[j]
              end
              return H_lo, 16
            end
          else
            local max_reg = ceil(digest_size_in_bytes / 8)
            if H_hi then
              for j = 1, max_reg do
                H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j])
              end
            else
              for j = 1, max_reg do
                H_lo[j] = HEX64(H_lo[j])
              end
            end
            -- reverse the order of the eight 2-character groups inside every 16-character group, then cut to the requested size
            H_lo = sub(gsub(table_concat(H_lo, "", 1, max_reg), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), 1, digest_size_in_bytes * 2)
          end
          H_hi = nil
        end
        return H_lo
      end
    end
    if key_length > 0 then
      -- the key, zero-padded to a whole 128-byte block, is hashed before the message
      partial(key..string_rep("\0", 128 - key_length))
    end
    if B2_offset then
      return partial()
    elseif message then
      -- Actually perform calculations and return the BLAKE2b digest of a message
      return partial(message)()
    else
      -- Return function for chunk-by-chunk loading
      -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2b digest by invoking this function without an argument
      return partial
    end
  end
  local function blake2sp(message, key, salt, digest_size_in_bytes)
    -- Calculates BLAKE2sp digest (eight BLAKE2s instances fed in round-robin order plus one root instance).
    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
    -- key:      (optional) binary string up to 32 bytes, by default empty string
    -- salt:     (optional) binary string up to 16 bytes, by default empty string
    -- digest_size_in_bytes: (optional) integer from 1 to 32, by default 32
    --           (a fractional value is rounded down, the same way blake2b() does it)
    -- Returns:
    --   hexadecimal digest string, when "message" is a string;
    --   loader function "partial", when "message" is nil.
    -- Raises an error (reported at the caller's position) if the digest size or the key length is out of range.

    -- floor() keeps this function consistent with blake2b(): without it a fractional size
    -- would pass the range check below and then be mixed into the parameter word by XOR
    digest_size_in_bytes = floor(digest_size_in_bytes or 32)
    if digest_size_in_bytes < 1 or digest_size_in_bytes > 32 then
      error("BLAKE2sp digest length must be from 1 to 32 bytes", 2)
    end
    key = key or ""
    local key_length = #key
    if key_length > 32 then
      error("BLAKE2sp key length must not exceed 32 bytes", 2)
    end
    salt = salt or ""
    -- instances: array of {bytes_compressed, tail, H}, one element per leaf; nil after the result was produced
    -- length:    total number of bytes received so far (it selects the leaf for the next block)
    -- result:    cached digest
    local instances, length, first_dword_of_parameter_block, result = {}, 0.0, 0x02080000 + key_length * 256 + digest_size_in_bytes
    for j = 1, 8 do
      local bytes_compressed, tail, H = 0.0, "", {unpack(sha2_H_hi)}
      instances[j] = {bytes_compressed, tail, H}
      H[1] = XOR(H[1], first_dword_of_parameter_block)
      H[3] = XOR(H[3], j-1) -- every leaf gets its own number 0..7
      H[4] = XOR(H[4], 0x20000000)
      if salt ~= "" then
        xor_blake2_salt(salt, "s", H)
      end
    end

    local function partial(message_part)
      -- partial(string) feeds the next chunk and returns itself (calls can be chained)
      -- partial()       finishes the calculation and returns the digest (repeated calls return the same digest)
      if message_part then
        if instances then
          local from = 0
          while true do
            -- take bytes up to the next 64-byte boundary of the whole input stream
            local to = math_min(from + 64 - length % 64, #message_part)
            if to > from then
              -- consecutive 64-byte blocks of the stream go to leaves 1,2,...,8,1,2,...
              local inst = instances[floor(length / 64) % 8 + 1]
              local part = sub(message_part, from + 1, to)
              length, from = length + to - from, to
              local bytes_compressed, tail = inst[1], inst[2]
              if #tail < 64 then
                tail = tail..part
              else
                -- a full block is compressed only when more data arrives for the same leaf,
                -- so the last block of every leaf is left for the finalization
                local H = inst[3]
                bytes_compressed = blake2s_feed_64(H, tail, 0, 64, bytes_compressed)
                tail = part
              end
              inst[1], inst[2] = bytes_compressed, tail
            else
              break
            end
          end
          return partial
        else
          error("Adding more chunks is not allowed after receiving the result", 2)
        end
      else
        if instances then
          local root_H = {unpack(sha2_H_hi)}
          root_H[1] = XOR(root_H[1], first_dword_of_parameter_block)
          root_H[4] = XOR(root_H[4], 0x20010000)
          if salt ~= "" then
            xor_blake2_salt(salt, "s", root_H)
          end
          for j = 1, 8 do
            local inst = instances[j]
            local bytes_compressed, tail, H = inst[1], inst[2], inst[3]
            -- final block of leaf j; the extra argument is true only for the last leaf
            blake2s_feed_64(H, tail..string_rep("\0", 64 - #tail), 0, 64, bytes_compressed, #tail, j == 8)
            if j % 2 == 0 then
              -- states of two finished leaves (2 * 8 words) form one 64-byte block for the root instance;
              -- the block is passed through the shared buffer instead of a string
              local index = 0
              for k = j - 1, j do
                local inst = instances[k]
                local H = inst[3]
                for i = 1, 8 do
                  index = index + 1
                  common_W_blake2s[index] = H[i]
                end
              end
              -- the 4th (last) root block is the final one: its length 64 and the extra flag are passed
              blake2s_feed_64(root_H, nil, 0, 64, 64 * (j/2 - 1), j == 8 and 64, j == 8)
            end
          end
          instances = nil
          local max_reg = ceil(digest_size_in_bytes / 4)
          for j = 1, max_reg do
            root_H[j] = HEX(root_H[j])
          end
          -- reverse the order of the four 2-character groups inside every 8-character group, then cut to the requested size
          result = sub(gsub(table_concat(root_H, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, digest_size_in_bytes * 2)
        end
        return result
      end
    end

    if key_length > 0 then
      -- the zero-padded key block is fed 8 times, so every leaf receives it as its first block
      key = key..string_rep("\0", 64 - key_length)
      for j = 1, 8 do
        partial(key)
      end
    end
    if message then
      -- Actually perform calculations and return the BLAKE2sp digest of a message
      return partial(message)()
    else
      -- Return function for chunk-by-chunk loading
      -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2sp digest by invoking this function without an argument
      return partial
    end
  end
  local function blake2bp(message, key, salt, digest_size_in_bytes)
    -- Calculates BLAKE2bp digest (four BLAKE2b instances fed in round-robin order plus one root instance).
    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
    -- key:      (optional) binary string up to 64 bytes, by default empty string
    -- salt:     (optional) binary string up to 32 bytes, by default empty string
    -- digest_size_in_bytes: (optional) integer from 1 to 64, by default 64
    --           (a fractional value is rounded down, the same way blake2b() does it)
    -- Returns:
    --   hexadecimal digest string, when "message" is a string;
    --   loader function "partial", when "message" is nil.
    -- Raises an error (reported at the caller's position) if the digest size or the key length is out of range.

    -- floor() keeps this function consistent with blake2b(): without it a fractional size
    -- would pass the range check below and then be mixed into the parameter word by XOR
    digest_size_in_bytes = floor(digest_size_in_bytes or 64)
    if digest_size_in_bytes < 1 or digest_size_in_bytes > 64 then
      error("BLAKE2bp digest length must be from 1 to 64 bytes", 2)
    end
    key = key or ""
    local key_length = #key
    if key_length > 64 then
      error("BLAKE2bp key length must not exceed 64 bytes", 2)
    end
    salt = salt or ""
    -- instances: array of {bytes_compressed, tail, H_lo, H_hi}, one element per leaf; nil after the result was produced
    -- length:    total number of bytes received so far (it selects the leaf for the next block)
    -- result:    cached digest
    local instances, length, first_dword_of_parameter_block, result = {}, 0.0, 0x02040000 + key_length * 256 + digest_size_in_bytes
    for j = 1, 4 do
      -- H_hi exists only when HEX64 is not defined (registers are split in two arrays)
      local bytes_compressed, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
      instances[j] = {bytes_compressed, tail, H_lo, H_hi}
      H_lo[1] = XORA5(H_lo[1], first_dword_of_parameter_block)
      H_lo[2] = XORA5(H_lo[2], j-1) -- every leaf gets its own number 0..3
      H_lo[3] = XORA5(H_lo[3], 0x4000)
      if salt ~= "" then
        xor_blake2_salt(salt, "b", H_lo, H_hi)
      end
    end

    local function partial(message_part)
      -- partial(string) feeds the next chunk and returns itself (calls can be chained)
      -- partial()       finishes the calculation and returns the digest (repeated calls return the same digest)
      if message_part then
        if instances then
          local from = 0
          while true do
            -- take bytes up to the next 128-byte boundary of the whole input stream
            local to = math_min(from + 128 - length % 128, #message_part)
            if to > from then
              -- consecutive 128-byte blocks of the stream go to leaves 1,2,3,4,1,2,...
              local inst = instances[floor(length / 128) % 4 + 1]
              local part = sub(message_part, from + 1, to)
              length, from = length + to - from, to
              local bytes_compressed, tail = inst[1], inst[2]
              if #tail < 128 then
                tail = tail..part
              else
                -- a full block is compressed only when more data arrives for the same leaf,
                -- so the last block of every leaf is left for the finalization
                local H_lo, H_hi = inst[3], inst[4]
                bytes_compressed = blake2b_feed_128(H_lo, H_hi, tail, 0, 128, bytes_compressed)
                tail = part
              end
              inst[1], inst[2] = bytes_compressed, tail
            else
              break
            end
          end
          return partial
        else
          error("Adding more chunks is not allowed after receiving the result", 2)
        end
      else
        if instances then
          local root_H_lo, root_H_hi = {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
          root_H_lo[1] = XORA5(root_H_lo[1], first_dword_of_parameter_block)
          root_H_lo[3] = XORA5(root_H_lo[3], 0x4001)
          if salt ~= "" then
            xor_blake2_salt(salt, "b", root_H_lo, root_H_hi)
          end
          for j = 1, 4 do
            local inst = instances[j]
            local bytes_compressed, tail, H_lo, H_hi = inst[1], inst[2], inst[3], inst[4]
            -- final block of leaf j; the extra argument is true only for the last leaf
            blake2b_feed_128(H_lo, H_hi, tail..string_rep("\0", 128 - #tail), 0, 128, bytes_compressed, #tail, j == 4)
            if j % 2 == 0 then
              -- states of two finished leaves form one 128-byte block for the root instance;
              -- the block is passed through the shared buffer instead of a string
              -- (8 elements per leaf for whole registers, 16 elements per leaf for split registers)
              local index = 0
              for k = j - 1, j do
                local inst = instances[k]
                local H_lo, H_hi = inst[3], inst[4]
                for i = 1, 8 do
                  index = index + 1
                  common_W_blake2b[index] = H_lo[i]
                  if H_hi then
                    index = index + 1
                    common_W_blake2b[index] = H_hi[i]
                  end
                end
              end
              -- the 2nd (last) root block is the final one: its length 128 and the extra flag are passed
              blake2b_feed_128(root_H_lo, root_H_hi, nil, 0, 128, 128 * (j/2 - 1), j == 4 and 128, j == 4)
            end
          end
          instances = nil
          local max_reg = ceil(digest_size_in_bytes / 8)
          if HEX64 then
            for j = 1, max_reg do
              root_H_lo[j] = HEX64(root_H_lo[j])
            end
          else
            for j = 1, max_reg do
              root_H_lo[j] = HEX(root_H_hi[j])..HEX(root_H_lo[j])
            end
          end
          -- reverse the order of the eight 2-character groups inside every 16-character group, then cut to the requested size
          result = sub(gsub(table_concat(root_H_lo, "", 1, max_reg), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), 1, digest_size_in_bytes * 2)
        end
        return result
      end
    end

    if key_length > 0 then
      -- the zero-padded key block is fed 4 times, so every leaf receives it as its first block
      key = key..string_rep("\0", 128 - key_length)
      for j = 1, 4 do
        partial(key)
      end
    end
    if message then
      -- Actually perform calculations and return the BLAKE2bp digest of a message
      return partial(message)()
    else
      -- Return function for chunk-by-chunk loading
      -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2bp digest by invoking this function without an argument
      return partial
    end
  end
  local function blake2x(inner_func, inner_func_letter, common_W_blake2, block_size, digest_size_in_bytes, message, key, salt)
    -- Common implementation of BLAKE2Xs and BLAKE2Xb (extendable-output functions built on top of BLAKE2s / BLAKE2b).
    -- inner_func:        blake2s or blake2b
    -- inner_func_letter: "s" or "b" (used for salt checking and in the error message)
    -- common_W_blake2:   shared buffer through which the root hash is passed to inner_func as an input block
    -- block_size:        number of digest bytes produced by one call of inner_func (32 or 64, see blake2xs/blake2xb)
    -- digest_size_in_bytes:
    --    non-negative = get finite digest as single Lua string
    --    (-1)         = get infinite digest in "chunk-by-chunk" output mode
    --    other negative = get finite digest of size (-digest_size_in_bytes) in "chunk-by-chunk" output mode
    -- message, key, salt: the same as for inner_func
    -- Returns (after the whole message was received):
    --    hexadecimal digest string, or
    --    function get_next_part_of_digest in "chunk-by-chunk" output mode
    -- When "message" is nil, the loader function "partial" is returned instead.
    local XOF_digest_length_limit, XOF_digest_length, chunk_by_chunk_output = 2^(block_size / 2) - 1
    if digest_size_in_bytes == -1 then -- infinite digest
      digest_size_in_bytes = math_huge
      -- the maximal value of the length field means "unknown length"
      XOF_digest_length = floor(XOF_digest_length_limit)
      chunk_by_chunk_output = true
    else
      if digest_size_in_bytes < 0 then
        digest_size_in_bytes = -1.0 * digest_size_in_bytes
        chunk_by_chunk_output = true
      end
      XOF_digest_length = floor(digest_size_in_bytes)
      if XOF_digest_length >= XOF_digest_length_limit then
        error("Requested digest is too long.  BLAKE2X"..inner_func_letter.." finite digest is limited by (2^"..floor(block_size / 2)..")-2 bytes.  Hint: you can generate infinite digest.", 2)
      end
    end
    salt = salt or ""
    if salt ~= "" then
      xor_blake2_salt(salt, inner_func_letter) -- don't xor, only check the size of salt
    end
    -- the root hash is calculated by inner_func in "chunk-by-chunk" input mode with default digest size
    local inner_partial = inner_func(nil, key, salt, nil, XOF_digest_length)
    local result
    local function partial(message_part)
      -- partial(string) feeds the next chunk and returns itself (calls can be chained)
      -- partial()       finishes the calculation and returns the result (repeated calls return the same result)
      if message_part then
        if inner_partial then
          inner_partial(message_part)
          return partial
        else
          error("Adding more chunks is not allowed after receiving the result", 2)
        end
      else
        if inner_partial then
          -- half_W is the raw (not converted to hex) root hash;
          -- blake2b returns its size 16 as second value when registers are split in halves, otherwise the size is 8
          local half_W, half_W_size = inner_partial()
          -- inner_partial receives nil here, that marks the calculation as finished
          half_W_size, inner_partial = half_W_size or 8
          local function get_hash_block(block_no)
            -- Returns hex string containing the block of the digest
            -- block_no = 0...(2^32-1)
            local size = math_min(block_size, digest_size_in_bytes - block_no * block_size)
            if size <= 0 then
              -- the block is beyond the end of a finite digest
              return ""
            end
            -- the input block for inner_func: the root hash followed by zeroes
            -- (the shared buffer must be refilled before every call)
            for j = 1, half_W_size do
              common_W_blake2[j] = half_W[j]
            end
            for j = half_W_size + 1, 2 * half_W_size do
              common_W_blake2[j] = 0
            end
            -- non-nil B2_offset (the last argument) switches inner_func to the "output block" mode
            return inner_func(nil, nil, salt, size, XOF_digest_length, floor(block_no))
          end
          local hash = {}
          if chunk_by_chunk_output then
            -- pos: current position (in bytes) inside the digest; the digest is periodic with "period" bytes
            -- cached_block_no, cached_block: the most recently generated block (it is reused by consecutive small reads)
            local pos, period, cached_block_no, cached_block = 0, block_size * 2^32
            local function get_next_part_of_digest(arg1, arg2)
              if arg1 == "seek" then
                -- Usage #1:  get_next_part_of_digest("seek", new_pos)
                pos = arg2 % period
              else
                -- Usage #2:  hex_string = get_next_part_of_digest(size)
                local size, index = arg1 or 1, 0
                while size > 0 do
                  local block_offset = pos % block_size
                  local block_no = (pos - block_offset) / block_size
                  local part_size = math_min(size, block_size - block_offset)
                  if cached_block_no ~= block_no then
                    cached_block_no = block_no
                    cached_block = get_hash_block(block_no)
                  end
                  index = index + 1
                  -- every byte is represented by two hex characters
                  hash[index] = sub(cached_block, block_offset * 2 + 1, (block_offset + part_size) * 2)
                  size = size - part_size
                  pos = (pos + part_size) % period
                end
                return table_concat(hash, "", 1, index)
              end
            end
            result = get_next_part_of_digest
          else
            -- float loop variable is used for the block number
            for j = 1.0, ceil(digest_size_in_bytes / block_size) do
              hash[j] = get_hash_block(j - 1.0)
            end
            result = table_concat(hash)
          end
        end
        return result
      end
    end
    if message then
      -- Actually perform calculations and return the BLAKE2X digest of a message
      return partial(message)()
    else
      -- Return function for chunk-by-chunk loading
      -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2X digest by invoking this function without an argument
      return partial
    end
  end
  local function blake2xs(digest_size_in_bytes, message, key, salt)
    -- BLAKE2Xs: extendable-output function based on BLAKE2s.
    -- digest_size_in_bytes selects the output mode:
    --    0..65534        = the whole finite digest is returned as single Lua string
    --    (-1)            = infinite digest, "chunk-by-chunk" output mode
    --    (-2)..(-65534)  = finite digest, "chunk-by-chunk" output mode
    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
    -- key:      (optional) binary string up to 32 bytes, by default empty string
    -- salt:     (optional) binary string up to 16 bytes, by default empty string
    local inner_hash, letter, shared_W, bytes_per_block = blake2s, "s", common_W_blake2s, 32
    -- tail call is intentional: blake2x() reports its errors at the position of our caller
    return blake2x(inner_hash, letter, shared_W, bytes_per_block, digest_size_in_bytes, message, key, salt)
  end
  local function blake2xb(digest_size_in_bytes, message, key, salt)
    -- BLAKE2Xb: extendable-output function based on BLAKE2b.
    -- digest_size_in_bytes selects the output mode:
    --    0..4294967294        = the whole finite digest is returned as single Lua string
    --    (-1)                 = infinite digest, "chunk-by-chunk" output mode
    --    (-2)..(-4294967294)  = finite digest, "chunk-by-chunk" output mode
    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
    -- key:      (optional) binary string up to 64 bytes, by default empty string
    -- salt:     (optional) binary string up to 32 bytes, by default empty string
    local inner_hash, letter, shared_W, bytes_per_block = blake2b, "b", common_W_blake2b, 64
    -- tail call is intentional: blake2x() reports its errors at the position of our caller
    return blake2x(inner_hash, letter, shared_W, bytes_per_block, digest_size_in_bytes, message, key, salt)
  end
  local function blake3(message, key, digest_size_in_bytes, message_flags, K, return_array)
    -- Calculates BLAKE3 digest (plain or keyed).
    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
    -- key:      (optional) binary string up to 32 bytes, by default empty string
    -- digest_size_in_bytes: (optional) by default 32
    --    0,1,2,3,4,...  = get finite digest as single Lua string
    --    (-1)           = get infinite digest in "chunk-by-chunk" output mode
    --    -2,-3,-4,...   = get finite digest in "chunk-by-chunk" output mode
    -- The last three parameters "message_flags", "K" and "return_array" are for internal use only, user must omit them (or pass nil)
    --    (they are used by blake3_derive_key: extra flags, ready-made key words, "return raw words instead of hex string")
    -- Returns:
    --    hexadecimal digest string, or function get_next_part_of_digest in "chunk-by-chunk" output mode;
    --    loader function "partial", when "message" is nil.
    -- Values of the flags used below: CHUNK_START=1, CHUNK_END=2, PARENT=4, ROOT=8, KEYED_HASH=16
    key = key or ""
    digest_size_in_bytes = digest_size_in_bytes or 32
    message_flags = message_flags or 0
    if key == "" then
      K = K or sha2_H_hi
    else
      local key_length = #key
      if key_length > 32 then
        error("BLAKE3 key length must not exceed 32 bytes", 2)
      end
      -- a short key is padded with zero bytes
      key = key..string_rep("\0", 32 - key_length)
      K = {}
      -- convert the key to eight 32-bit words (the first byte of every group is the least significant one)
      for j = 1, 8 do
        local a, b, c, d = byte(key, 4*j-3, 4*j)
        K[j] = ((d * 256 + c) * 256 + b) * 256 + a
      end
      message_flags = message_flags + 16  -- flag:KEYED_HASH
    end
    -- tail:            not yet compressed bytes (1..64 bytes once any data was received); nil after the result was produced
    -- H:               state of the current chunk (8 words)
    -- chunk_index:     number of completed chunks
    -- blocks_in_chunk: number of 64-byte blocks already compressed in the current chunk (0..15)
    -- stack:           states of completed subtrees (8 words per element), stack_size is measured in words;
    --                  during the output phase the same table is reused as a scratch buffer
    local tail, H, chunk_index, blocks_in_chunk, stack_size, stack = "", {}, 0, 0, 0, {}
    -- final_H_in, final_block_length, final_compression_flags: parameters of the postponed compression of the last block
    local final_H_in, final_block_length, chunk_by_chunk_output, result, wide_output = K
    local final_compression_flags = 3     -- flags:CHUNK_START,CHUNK_END
    local function feed_blocks(str, offs, size)
      -- Compresses "size" bytes of "str" starting after offset "offs"
      -- size >= 0, size is multiple of 64
      while size > 0 do
        local part_size_in_blocks, block_flags, H_in = 1, 0, H
        if blocks_in_chunk == 0 then
          -- the first block of a chunk is compressed alone: it starts from K instead of H
          block_flags = 1                 -- flag:CHUNK_START
          H_in, final_H_in = K, H
          final_compression_flags = 2     -- flag:CHUNK_END
        elseif blocks_in_chunk == 15 then
          -- the last (16th) block of a chunk is compressed alone too
          block_flags = 2                 -- flag:CHUNK_END
          final_compression_flags = 3     -- flags:CHUNK_START,CHUNK_END
          final_H_in = K
        else
          -- blocks from the middle of a chunk can be compressed by one call
          part_size_in_blocks = math_min(size / 64, 15 - blocks_in_chunk)
        end
        local part_size = part_size_in_blocks * 64
        blake3_feed_64(str, offs, part_size, message_flags + block_flags, chunk_index, H_in, H)
        offs, size = offs + part_size, size - part_size
        blocks_in_chunk = (blocks_in_chunk + part_size_in_blocks) % 16
        if blocks_in_chunk == 0 then
          -- completing the current chunk
          chunk_index = chunk_index + 1.0
          local divider = 2.0
          -- merge the chunk with the subtrees on the stack:
          -- one merge for every factor 2 in the number of completed chunks
          while chunk_index % divider == 0 do
            divider = divider * 2.0
            stack_size = stack_size - 8
            -- the parent block is passed through the shared buffer: left child (from the stack), then right child (H)
            for j = 1, 8 do
              common_W_blake2s[j] = stack[stack_size + j]
            end
            for j = 1, 8 do
              common_W_blake2s[j + 8] = H[j]
            end
            blake3_feed_64(nil, 0, 64, message_flags + 4, 0, K, H)  -- flag:PARENT
          end
          -- push the state of the completed subtree
          for j = 1, 8 do
            stack[stack_size + j] = H[j]
          end
          stack_size = stack_size + 8
        end
      end
    end
    local function get_hash_block(block_no)
      -- Returns hex string containing the 64-byte block of the digest
      -- (or the table "stack" itself when "return_array" is set)
      local size = math_min(64, digest_size_in_bytes - block_no * 64)
      if block_no < 0 or size <= 0 then
        return ""
      end
      if chunk_by_chunk_output then
        -- restore the input words of the root compression saved in stack[17..32] by partial()
        for j = 1, 16 do
          common_W_blake2s[j] = stack[j + 16]
        end
      end
      -- the root compression writes its output to the table "stack"
      blake3_feed_64(nil, 0, 64, final_compression_flags, block_no, final_H_in, stack, wide_output, final_block_length)
      if return_array then
        return stack
      end
      local max_reg = ceil(size / 4)
      for j = 1, max_reg do
        stack[j] = HEX(stack[j])
      end
      -- reverse the order of the four 2-character groups inside every 8-character group, then cut to the requested size
      return sub(gsub(table_concat(stack, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, size * 2)
    end
    local function partial(message_part)
      -- partial(string) feeds the next chunk and returns itself (calls can be chained)
      -- partial()       finishes the calculation and returns the result (repeated calls return the same result)
      if message_part then
        if tail then
          local offs = 0
          if tail ~= "" and #tail + #message_part > 64 then
            -- complete the pending block with the beginning of the new chunk
            offs = 64 - #tail
            feed_blocks(tail..sub(message_part, 1, offs), 0, 64)
            tail = ""
          end
          local size = #message_part - offs
          -- the last 1..64 bytes are always kept in the tail, so the final block is compressed only by partial()
          local size_tail = size > 0 and (size - 1) % 64 + 1 or 0
          feed_blocks(message_part, offs, size - size_tail)
          tail = tail..sub(message_part, #message_part + 1 - size_tail)
          return partial
        else
          error("Adding more chunks is not allowed after receiving the result", 2)
        end
      else
        if tail then
          final_block_length = #tail
          tail = tail..string_rep("\0", 64 - #tail)
          -- convert the last block to 16 words in the shared buffer;
          -- NOTE(review): element [0] of the shared buffer presumably marks the implementation branch
          -- that stores the words as results of bitwise functions - confirm at the place where the buffer is created
          if common_W_blake2s[0] then
            for j = 1, 16 do
              local a, b, c, d = byte(tail, 4*j-3, 4*j)
              common_W_blake2s[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
            end
          else
            for j = 1, 16 do
              local a, b, c, d = byte(tail, 4*j-3, 4*j)
              common_W_blake2s[j] = ((d * 256 + c) * 256 + b) * 256 + a
            end
          end
          tail = nil
          -- merge the last chunk with all the subtrees remaining on the stack (from the top to the bottom);
          -- every iteration performs the postponed compression and prepares the next parent block,
          -- so the very last compression (the root) stays postponed for get_hash_block()
          for stack_size = stack_size - 8, 0, -8 do
            blake3_feed_64(nil, 0, 64, message_flags + final_compression_flags, chunk_index, final_H_in, H, nil, final_block_length)
            chunk_index, final_block_length, final_H_in, final_compression_flags = 0, 64, K, 4  -- flag:PARENT
            for j = 1, 8 do
              common_W_blake2s[j] = stack[stack_size + j]
            end
            for j = 1, 8 do
              common_W_blake2s[j + 8] = H[j]
            end
          end
          final_compression_flags = message_flags + final_compression_flags + 8  -- flag:ROOT
          if digest_size_in_bytes < 0 then
            if digest_size_in_bytes == -1 then  -- infinite digest
              digest_size_in_bytes = math_huge
            else
              digest_size_in_bytes = -1.0 * digest_size_in_bytes
            end
            chunk_by_chunk_output = true
            -- save the input words of the root compression: the shared buffer may be overwritten between the reads
            for j = 1, 16 do
              stack[j + 16] = common_W_blake2s[j]
            end
          end
          digest_size_in_bytes = math_min(2^53, digest_size_in_bytes)
          -- more than 8 output words per root compression are needed only for long digests
          wide_output = digest_size_in_bytes > 32
          if chunk_by_chunk_output then
            -- pos: current position (in bytes) inside the digest
            -- cached_block_no, cached_block: the most recently generated block (it is reused by consecutive small reads)
            local pos, cached_block_no, cached_block = 0.0
            local function get_next_part_of_digest(arg1, arg2)
              if arg1 == "seek" then
                -- Usage #1:  get_next_part_of_digest("seek", new_pos)
                pos = arg2 * 1.0
              else
                -- Usage #2:  hex_string = get_next_part_of_digest(size)
                -- parts of the result are collected in stack[33], stack[34], ...
                local size, index = arg1 or 1, 32
                while size > 0 do
                  local block_offset = pos % 64
                  local block_no = (pos - block_offset) / 64
                  local part_size = math_min(size, 64 - block_offset)
                  if cached_block_no ~= block_no then
                    cached_block_no = block_no
                    cached_block = get_hash_block(block_no)
                  end
                  index = index + 1
                  -- every byte is represented by two hex characters
                  stack[index] = sub(cached_block, block_offset * 2 + 1, (block_offset + part_size) * 2)
                  size = size - part_size
                  pos = pos + part_size
                end
                return table_concat(stack, "", 33, index)
              end
            end
            result = get_next_part_of_digest
          elseif digest_size_in_bytes <= 64 then
            result = get_hash_block(0)
          else
            -- long finite digest: blocks are collected in stack[33], stack[34], ...
            local last_block_no = ceil(digest_size_in_bytes / 64) - 1
            for block_no = 0.0, last_block_no do
              stack[33 + block_no] = get_hash_block(block_no)
            end
            result = table_concat(stack, "", 33, 33 + last_block_no)
          end
        end
        return result
      end
    end
    if message then
      -- Actually perform calculations and return the BLAKE3 digest of a message
      return partial(message)()
    else
      -- Return function for chunk-by-chunk loading
      -- User should feed every chunk of input data as single argument to this function and finally get BLAKE3 digest by invoking this function without an argument
      return partial
    end
  end
  local function blake3_derive_key(key_material, context_string, derived_key_size_in_bytes)
    -- BLAKE3 in key derivation mode.
    -- key_material: (string) your source of entropy to derive a key from (for example, it can be a master password)
    --               set to nil for feeding the key material in "chunk-by-chunk" input mode
    -- context_string: (string) unique description of the derived key
    -- derived_key_size_in_bytes: (optional) by default 32
    --    0,1,2,3,4,...  = get finite derived key as single Lua string
    --    (-1)           = get infinite derived key in "chunk-by-chunk" output mode
    --    -2,-3,-4,...   = get finite derived key in "chunk-by-chunk" output mode
    -- Raises an error (reported at the caller's position) when "context_string" is not a string.
    local DERIVE_KEY_CONTEXT, DERIVE_KEY_MATERIAL = 32, 64  -- BLAKE3 flags
    if type(context_string) == "string" then
      -- stage 1: hash the context string; the result is requested as array of words (not as hex string)
      local context_key = blake3(context_string, nil, nil, DERIVE_KEY_CONTEXT, nil, true)
      -- stage 2: hash the key material using the words from stage 1 in place of the key
      return blake3(key_material, nil, derived_key_size_in_bytes, DERIVE_KEY_MATERIAL, context_key)
    end
    error("'context_string' parameter must be a Lua string", 2)
  end
  -- Table of the functions exported by the module (it is returned at the end of the file).
  -- Hash functions defined here as anonymous wrappers only fix the parameters of the generic implementations
  -- (sha256ext, sha512ext, keccak, blake2b, blake2s).
  local sha = {
    md5        = md5,                                                                                                                   -- MD5
    sha1       = sha1,                                                                                                                  -- SHA-1
    -- SHA-2 hash functions:
    sha224     = function (message)                       return sha256ext(224, message)                                           end, -- SHA-224
    sha256     = function (message)                       return sha256ext(256, message)                                           end, -- SHA-256
    sha512_224 = function (message)                       return sha512ext(224, message)                                           end, -- SHA-512/224
    sha512_256 = function (message)                       return sha512ext(256, message)                                           end, -- SHA-512/256
    sha384     = function (message)                       return sha512ext(384, message)                                           end, -- SHA-384
    sha512     = function (message)                       return sha512ext(512, message)                                           end, -- SHA-512
    -- SHA-3 hash functions:
    -- (the first argument of keccak is (1600 - 2 * N) / 8, the same values are listed as block sizes in "block_size_for_HMAC")
    sha3_224   = function (message)                       return keccak((1600 - 2 * 224) / 8, 224 / 8, false, message)             end, -- SHA3-224
    sha3_256   = function (message)                       return keccak((1600 - 2 * 256) / 8, 256 / 8, false, message)             end, -- SHA3-256
    sha3_384   = function (message)                       return keccak((1600 - 2 * 384) / 8, 384 / 8, false, message)             end, -- SHA3-384
    sha3_512   = function (message)                       return keccak((1600 - 2 * 512) / 8, 512 / 8, false, message)             end, -- SHA3-512
    shake128   = function (digest_size_in_bytes, message) return keccak((1600 - 2 * 128) / 8, digest_size_in_bytes, true, message) end, -- SHAKE128
    shake256   = function (digest_size_in_bytes, message) return keccak((1600 - 2 * 256) / 8, digest_size_in_bytes, true, message) end, -- SHAKE256
    -- HMAC:
    hmac       = hmac,  -- HMAC(hash_func, key, message) is applicable to any hash function from this module except SHAKE* and BLAKE*
    -- misc utilities:
    hex_to_bin    = hex_to_bin,     -- converts hexadecimal representation to binary string
    bin_to_hex    = bin_to_hex,     -- converts binary string to hexadecimal representation
    base64_to_bin = base64_to_bin,  -- converts base64 representation to binary string
    bin_to_base64 = bin_to_base64,  -- converts binary string to base64 representation
    -- old style names for backward compatibility:
    hex2bin       = hex_to_bin,
    bin2hex       = bin_to_hex,
    base642bin    = base64_to_bin,
    bin2base64    = bin_to_base64,
    -- BLAKE2 hash functions:
    blake2b  = blake2b,   -- BLAKE2b (message, key, salt, digest_size_in_bytes)
    blake2s  = blake2s,   -- BLAKE2s (message, key, salt, digest_size_in_bytes)
    blake2bp = blake2bp,  -- BLAKE2bp(message, key, salt, digest_size_in_bytes)
    blake2sp = blake2sp,  -- BLAKE2sp(message, key, salt, digest_size_in_bytes)
    blake2xb = blake2xb,  -- BLAKE2Xb(digest_size_in_bytes, message, key, salt)
    blake2xs = blake2xs,  -- BLAKE2Xs(digest_size_in_bytes, message, key, salt)
    -- BLAKE2 aliases:
    -- (the wrappers accept only three arguments, so the internal parameters of blake2b/blake2s can't be passed through them)
    blake2      = blake2b,
    blake2b_160 = function (message, key, salt) return blake2b(message, key, salt, 20) end, -- BLAKE2b-160
    blake2b_256 = function (message, key, salt) return blake2b(message, key, salt, 32) end, -- BLAKE2b-256
    blake2b_384 = function (message, key, salt) return blake2b(message, key, salt, 48) end, -- BLAKE2b-384
    blake2b_512 = blake2b,  -- 64                                                           -- BLAKE2b-512
    blake2s_128 = function (message, key, salt) return blake2s(message, key, salt, 16) end, -- BLAKE2s-128
    blake2s_160 = function (message, key, salt) return blake2s(message, key, salt, 20) end, -- BLAKE2s-160
    blake2s_224 = function (message, key, salt) return blake2s(message, key, salt, 28) end, -- BLAKE2s-224
    blake2s_256 = blake2s,  -- 32                                                           -- BLAKE2s-256
    -- BLAKE3 hash function
    blake3            = blake3,             -- BLAKE3    (message, key, digest_size_in_bytes)
    blake3_derive_key = blake3_derive_key,  -- BLAKE3_KDF(key_material, context_string, derived_key_size_in_bytes)
  }
  -- Block size (in bytes) of every hash function supported by HMAC.
  -- The table is indexed by the function values stored in the table "sha" above, so it can be filled only after
  -- the table "sha" was created (the anonymous wrappers don't exist earlier).
  -- SHAKE* and BLAKE* functions are intentionally absent here.
  -- NOTE(review): the assignment has no "local" keyword; presumably "block_size_for_HMAC" is declared as a local
  -- variable earlier in the file (before the function "hmac") - confirm, otherwise a global variable is created here.
  block_size_for_HMAC = {
    [sha.md5]        =  64,
    [sha.sha1]       =  64,
    [sha.sha224]     =  64,
    [sha.sha256]     =  64,
    [sha.sha512_224] = 128,
    [sha.sha512_256] = 128,
    [sha.sha384]     = 128,
    [sha.sha512]     = 128,
    [sha.sha3_224]   = 144,  -- (1600 - 2 * 224) / 8
    [sha.sha3_256]   = 136,  -- (1600 - 2 * 256) / 8
    [sha.sha3_384]   = 104,  -- (1600 - 2 * 384) / 8
    [sha.sha3_512]   =  72,  -- (1600 - 2 * 512) / 8
  }
  -- the module's value is the table of exported functions
  return sha