From 9f26e53798f5f7b2995bafdb57fe970eb37cd439 Mon Sep 17 00:00:00 2001 From: Thomas Lynch Date: Wed, 24 Nov 2021 05:17:06 +1100 Subject: [PATCH] combine POW and captcha into one --- .gitignore | 1 - docker-compose.yml | 12 +- haproxy/domains_under_ddos.txt | 2 - haproxy/haproxy.cfg | 14 +- js/sha1.js | 37 + js/worker.js | 27 + src/libs/sha.lua | 2962 ++++++++++++++++++++++++++++++++ src/libs/utils.lua | 25 +- src/scripts/hcaptcha.lua | 34 +- src/scripts/register.lua | 3 +- 10 files changed, 3088 insertions(+), 29 deletions(-) delete mode 100644 .gitignore delete mode 100644 haproxy/domains_under_ddos.txt create mode 100644 js/sha1.js create mode 100644 js/worker.js create mode 100644 src/libs/sha.lua diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 1120be9..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -docker-compose.yml diff --git a/docker-compose.yml b/docker-compose.yml index 76a9077..006393b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,10 +8,14 @@ services: - 80:80 volumes: - ./haproxy/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg - - ./haproxy/domains_under_ddos.txt:/usr/local/etc/haproxy/domains_under_ddos.txt - ./src/scripts/:/usr/local/etc/haproxy/scripts/ - ./src/libs/:/usr/local/etc/haproxy/libs/ environment: - - HCAPTCHA_SECRET=0x0000000000000000000000000000000000000000 - - HCAPTCHA_SITEKEY=10000000-ffff-ffff-ffff-000000000001 - - COOKIE_SECRET=changeme + - HCAPTCHA_SECRET= + - HCAPTCHA_SITEKEY= + - CAPTCHA_COOKIE_SECRET= + - POW_COOKIE_SECRET= + nginx: + image: "nginx:latest" + volumes: + - ./js:/usr/share/nginx/html diff --git a/haproxy/domains_under_ddos.txt b/haproxy/domains_under_ddos.txt deleted file mode 100644 index b735d48..0000000 --- a/haproxy/domains_under_ddos.txt +++ /dev/null @@ -1,2 +0,0 @@ -poge.com -domain.com diff --git a/haproxy/haproxy.cfg b/haproxy/haproxy.cfg index f8109c9..269547f 100644 --- a/haproxy/haproxy.cfg +++ b/haproxy/haproxy.cfg @@ -15,13 +15,19 @@ frontend http-in bind *:80 acl ddos_mode_enabled hdr_cnt(xr3la1rfFc) eq 0 - acl domain_under_ddos hdr(host) -i -f /usr/local/etc/haproxy/domains_under_ddos.txt + acl pow_passed var(txn.pow_passed) -m bool acl captcha_passed var(txn.captcha_passed) -m bool + acl on_captcha_url path -m beg /bot-check + acl is_excluded path_end -i .js .ico + + use_backend servers if is_excluded - http-request use-service lua.hcaptcha-view if on_captcha_url - http-request lua.hcaptcha-redirect if !on_captcha_url ddos_mode_enabled OR domain_under_ddos - http-request redirect location /bot-check?%[capture.req.uri] code 302 if !captcha_passed !on_captcha_url ddos_mode_enabled OR domain_under_ddos + http-request use-service lua.hcaptcha-view if on_captcha_url !is_excluded + http-request lua.hcaptcha-check if !is_excluded !on_captcha_url ddos_mode_enabled + http-request lua.pow-check if !is_excluded !on_captcha_url ddos_mode_enabled + + http-request redirect location /bot-check?%[capture.req.uri] code 302 if !captcha_passed !on_captcha_url ddos_mode_enabled !is_excluded OR !pow_passed !on_captcha_url ddos_mode_enabled !is_excluded default_backend servers diff --git a/js/sha1.js b/js/sha1.js new file mode 100644 index 0000000..565d661 --- /dev/null +++ b/js/sha1.js @@ -0,0 +1,37 @@ +if (window.Worker) { + const challenge = document.querySelector('[data-pow]').dataset.pow; + const difficulty = 0; + const start = Date.now(); + const threads = window.navigator.hardwareConcurrency-1; + let finished = false; + const messageHandler = (e) => { + if (finished) { return; } + finished = true; + workers.forEach(w => w.terminate()); + const [workerId, answer] = e.data; + console.log('Worker', workerId, 'returned answer', answer); + document.cookie='z_ddos_pow='+answer+';expires=Thu, 31-Dec-37 23:55:55 GMT; path=/;'; + const dummyTime = 4000 - (Date.now()-start); + window.setTimeout(function(){ + //still dumm delay when captcha is enabled. might remove, but nobody solves captchas that fast. it feels nicer. + const submitButton = document.querySelector('input[type=submit]') + if (submitButton) { + //button is shown only if captcha is enabled + submitButton.disabled = false; + submitButton.value = 'Submit'; + } else { + window.location=location.search.slice(1)+location.hash;; + } + }, dummyTime); + } + const workers = []; + for (let i = 0; i < threads; i++) { + const shaWorker = new Worker('/worker.js'); + shaWorker.onmessage = messageHandler; + workers.push(shaWorker); + } + workers.forEach((w, i) => w.postMessage([challenge, difficulty, i, threads])); +} else { + console.warn('No webworker support, using legacy method in main/UI thread!'); + console.time('challenge');var _0x7040=['2429IsdGeN','541usNEzS','2gJvpao','441OQYeOS','dataset','1540379PlFpGF','2268690VWQeIz','1246372GHrGxp','querySelector','3805kSDNgG','248730Xagbxx','401870MFFOOz'];var _0x1f5c=function(_0x9e3176,_0x4e853a){_0x9e3176=_0x9e3176-0x97;var _0x7040e7=_0x7040[_0x9e3176];return _0x7040e7;};var _0x493ff0=_0x1f5c;(function(_0x370b96,_0x203032){var _0x277aa0=_0x1f5c;while(!![]){try{var _0x36550f=parseInt(_0x277aa0(0xa0))+parseInt(_0x277aa0(0x9b))*parseInt(_0x277aa0(0x9c))+-parseInt(_0x277aa0(0x9d))*parseInt(_0x277aa0(0x99))+-parseInt(_0x277aa0(0xa2))+-parseInt(_0x277aa0(0x9e))*-parseInt(_0x277aa0(0x98))+parseInt(_0x277aa0(0x9a))+-parseInt(_0x277aa0(0xa1));if(_0x36550f===_0x203032)break;else _0x370b96['push'](_0x370b96['shift']());}catch(_0x4c27b1){_0x370b96['push'](_0x370b96['shift']());}}}(_0x7040,0xe10dd),_0x7040=parseInt(document[_0x493ff0(0x97)]('[data-d]')[_0x493ff0(0x9f)]['d']));console.log('difficulty:', _0x7040);var _0x1094=['1507110oduihL','754597WVZgkt','717007clShnY','396822zlHkZs','[data-pow]','dataset','329295ltfqea','querySelector','670868jIstGQ','358089GQrazs'];var _0x639e=function(_0x13e12c,_0x11c237){_0x13e12c=_0x13e12c-0x14e;var _0x10946f=_0x1094[_0x13e12c];return _0x10946f;};var _0xc0f45d=_0x639e;(function(_0x188de5,_0x2bcaab){var _0x21e026=_0x639e;while(!![]){try{var _0x4aa295=parseInt(_0x21e026(0x155))+-parseInt(_0x21e026(0x151))+parseInt(_0x21e026(0x156))+parseInt(_0x21e026(0x14f))+parseInt(_0x21e026(0x152))+parseInt(_0x21e026(0x154))+-parseInt(_0x21e026(0x153));if(_0x4aa295===_0x2bcaab)break;else _0x188de5['push'](_0x188de5['shift']());}catch(_0x5225c7){_0x188de5['push'](_0x188de5['shift']());}}}(_0x1094,0x5c3e8),_0x1094=document[_0xc0f45d(0x150)](_0xc0f45d(0x157))[_0xc0f45d(0x14e)]['s']);window.onload=function(){!function(){function t(t){t?(f[0]=f[16]=f[1]=f[2]=f[3]=f[4]=f[5]=f[6]=f[7]=f[8]=f[9]=f[10]=f[11]=f[12]=f[13]=f[14]=f[15]=0,this.blocks=f):this.blocks=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],this.h0=1732584193,this.h1=4023233417,this.h2=2562383102,this.h3=271733878,this.h4=3285377520,this.block=this.start=this.bytes=this.hBytes=0,this.finalized=this.hashed=!1,this.first=!0}var h="object"==typeof window?window:{},s=!h.JS_SHA1_NO_NODE_JS&&"object"==typeof process&&process.versions&&process.versions.node;s&&(h=global);var i=!h.JS_SHA1_NO_COMMON_JS&&"object"==typeof module&&module.exports,e="function"==typeof define&&define.amd,r="0123456789abcdef".split(""),o=[-2147483648,8388608,32768,128],n=[24,16,8,0],a=["hex","array","digest","arrayBuffer"],f=[],u=function(h){return function(s){return new t(!0).update(s)[h]()}},c=function(){var h=u("hex");s&&(h=p(h)),h.create=function(){return new t},h.update=function(t){return h.create().update(t)};for(var i=0;i>2]|=t[r]<>2]|=i<>2]|=(192|i>>6)<>2]|=(128|63&i)<=57344?(a[e>>2]|=(224|i>>12)<>2]|=(128|i>>6&63)<>2]|=(128|63&i)<>2]|=(240|i>>18)<>2]|=(128|i>>12&63)<>2]|=(128|i>>6&63)<>2]|=(128|63&i)<=64?(this.block=a[16],this.start=e-64,this.hash(),this.hashed=!0):this.start=e}return this.bytes>4294967295&&(this.hBytes+=this.bytes/4294967296<<0,this.bytes=this.bytes%4294967296),this}},t.prototype.finalize=function(){if(!this.finalized){this.finalized=!0;var t=this.blocks,h=this.lastByteIndex;t[16]=this.block,t[h>>2]|=o[3&h],this.block=t[16],h>=56&&(this.hashed||this.hash(),t[0]=this.block,t[16]=t[1]=t[2]=t[3]=t[4]=t[5]=t[6]=t[7]=t[8]=t[9]=t[10]=t[11]=t[12]=t[13]=t[14]=t[15]=0),t[14]=this.hBytes<<3|this.bytes>>>29,t[15]=this.bytes<<3,this.hash()}},t.prototype.hash=function(){var t,h,s=this.h0,i=this.h1,e=this.h2,r=this.h3,o=this.h4,n=this.blocks;for(t=16;t<80;++t)h=n[t-3]^n[t-8]^n[t-14]^n[t-16],n[t]=h<<1|h>>>31;for(t=0;t<20;t+=5)s=(h=(i=(h=(e=(h=(r=(h=(o=(h=s<<5|s>>>27)+(i&e|~i&r)+o+1518500249+n[t]<<0)<<5|o>>>27)+(s&(i=i<<30|i>>>2)|~s&e)+r+1518500249+n[t+1]<<0)<<5|r>>>27)+(o&(s=s<<30|s>>>2)|~o&i)+e+1518500249+n[t+2]<<0)<<5|e>>>27)+(r&(o=o<<30|o>>>2)|~r&s)+i+1518500249+n[t+3]<<0)<<5|i>>>27)+(e&(r=r<<30|r>>>2)|~e&o)+s+1518500249+n[t+4]<<0,e=e<<30|e>>>2;for(;t<40;t+=5)s=(h=(i=(h=(e=(h=(r=(h=(o=(h=s<<5|s>>>27)+(i^e^r)+o+1859775393+n[t]<<0)<<5|o>>>27)+(s^(i=i<<30|i>>>2)^e)+r+1859775393+n[t+1]<<0)<<5|r>>>27)+(o^(s=s<<30|s>>>2)^i)+e+1859775393+n[t+2]<<0)<<5|e>>>27)+(r^(o=o<<30|o>>>2)^s)+i+1859775393+n[t+3]<<0)<<5|i>>>27)+(e^(r=r<<30|r>>>2)^o)+s+1859775393+n[t+4]<<0,e=e<<30|e>>>2;for(;t<60;t+=5)s=(h=(i=(h=(e=(h=(r=(h=(o=(h=s<<5|s>>>27)+(i&e|i&r|e&r)+o-1894007588+n[t]<<0)<<5|o>>>27)+(s&(i=i<<30|i>>>2)|s&e|i&e)+r-1894007588+n[t+1]<<0)<<5|r>>>27)+(o&(s=s<<30|s>>>2)|o&i|s&i)+e-1894007588+n[t+2]<<0)<<5|e>>>27)+(r&(o=o<<30|o>>>2)|r&s|o&s)+i-1894007588+n[t+3]<<0)<<5|i>>>27)+(e&(r=r<<30|r>>>2)|e&o|r&o)+s-1894007588+n[t+4]<<0,e=e<<30|e>>>2;for(;t<80;t+=5)s=(h=(i=(h=(e=(h=(r=(h=(o=(h=s<<5|s>>>27)+(i^e^r)+o-899497514+n[t]<<0)<<5|o>>>27)+(s^(i=i<<30|i>>>2)^e)+r-899497514+n[t+1]<<0)<<5|r>>>27)+(o^(s=s<<30|s>>>2)^i)+e-899497514+n[t+2]<<0)<<5|e>>>27)+(r^(o=o<<30|o>>>2)^s)+i-899497514+n[t+3]<<0)<<5|i>>>27)+(e^(r=r<<30|r>>>2)^o)+s-899497514+n[t+4]<<0,e=e<<30|e>>>2;this.h0=this.h0+s<<0,this.h1=this.h1+i<<0,this.h2=this.h2+e<<0,this.h3=this.h3+r<<0,this.h4=this.h4+o<<0},t.prototype.hex=function(){this.finalize();var t=this.h0,h=this.h1,s=this.h2,i=this.h3,e=this.h4;return r[t>>28&15]+r[t>>24&15]+r[t>>20&15]+r[t>>16&15]+r[t>>12&15]+r[t>>8&15]+r[t>>4&15]+r[15&t]+r[h>>28&15]+r[h>>24&15]+r[h>>20&15]+r[h>>16&15]+r[h>>12&15]+r[h>>8&15]+r[h>>4&15]+r[15&h]+r[s>>28&15]+r[s>>24&15]+r[s>>20&15]+r[s>>16&15]+r[s>>12&15]+r[s>>8&15]+r[s>>4&15]+r[15&s]+r[i>>28&15]+r[i>>24&15]+r[i>>20&15]+r[i>>16&15]+r[i>>12&15]+r[i>>8&15]+r[i>>4&15]+r[15&i]+r[e>>28&15]+r[e>>24&15]+r[e>>20&15]+r[e>>16&15]+r[e>>12&15]+r[e>>8&15]+r[e>>4&15]+r[15&e]},t.prototype.toString=t.prototype.hex,t.prototype.digest=function(){this.finalize();var t=this.h0,h=this.h1,s=this.h2,i=this.h3,e=this.h4;return[t>>24&255,t>>16&255,t>>8&255,255&t,h>>24&255,h>>16&255,h>>8&255,255&h,s>>24&255,s>>16&255,s>>8&255,255&s,i>>24&255,i>>16&255,i>>8&255,255&i,e>>24&255,e>>16&255,e>>8&255,255&e]},t.prototype.array=t.prototype.digest,t.prototype.arrayBuffer=function(){this.finalize();var t=new ArrayBuffer(20),h=new DataView(t);return h.setUint32(0,this.h0),h.setUint32(4,this.h1),h.setUint32(8,this.h2),h.setUint32(12,this.h3),h.setUint32(16,this.h4),t};var y=c();i?module.exports=y:(h.s1=y,e&&define(function(){return y}))}();const a0_0x2a54=[_0x1094,'z_ddos_pow=','array'];(function(_0x41abf3,_0x2a548e){const _0x4457dc=function(_0x804ad2){while(--_0x804ad2){_0x41abf3['push'](_0x41abf3['shift']());}};_0x4457dc(++_0x2a548e);}(a0_0x2a54,0x178));const a0_0x4457=function(_0x41abf3,_0x2a548e){_0x41abf3=_0x41abf3-0x0;let _0x4457dc=a0_0x2a54[_0x41abf3];return _0x4457dc;};let c=a0_0x4457('0x2');let i=0x0;let n1=parseInt('0x'+c[0x0]);while(!![]){let s=s1[a0_0x4457('0x1')](c+i);let oov=true;for(let oovi=1;oovi<=_0x7040;oovi++){oov=(oov&&(s[n1+oovi]===0x00))};if(s[n1]===0xb0&&oov&&s[n1+_0x7040+0x1]===0x0b){console.log('Found solution in', i, 'iterations');document['cookie']=a0_0x4457('0x0')+c+i+';expires=Thu, 31-Dec-37 23:55:55 GMT; path=/; domain='+'.'+(new URL(window.location).hostname.split('.').slice(-2).join('.'))+';SameSite=Strict; '+(location.protocol==='https:'?'Secure=true; ':'');break;}i++;};;console.timeEnd('challenge');window.setTimeout(function(){window.location.reload()}, 4000)} +} diff --git a/js/worker.js b/js/worker.js new file mode 100644 index 0000000..d18bb7c --- /dev/null +++ b/js/worker.js @@ -0,0 +1,27 @@ +async function hash(data, method) { + const buffer = new TextEncoder('utf-8').encode(data); + const hashBuffer = await crypto.subtle.digest(method, buffer) + return Array.from(new Uint8Array(hashBuffer)); +} + +onmessage = async function(e) { + const [challenge, difficulty, id, threads] = e.data; + console.log('Worker thread', id,'got challenge', challenge, 'with difficulty', difficulty); + let i = id; + let challengeIndex = parseInt(challenge[0], 16); + while(true) { + let result = await hash(challenge+i, 'sha-1'); + let middle = true; + for(let imiddle = 1; imiddle <= difficulty; imiddle++) { + middle = (middle && (result[challengeIndex+imiddle] === 0x00)); + } + if(result[challengeIndex] === 0xb0 + && middle === true + && result[challengeIndex+difficulty+1] === 0x0b){ + console.log('Worker thread found solution', challenge, i, result[challengeIndex], result[challengeIndex+1]); + postMessage([id, i]); + break; + } + i+=threads; + } +} diff --git a/src/libs/sha.lua b/src/libs/sha.lua new file mode 100644 index 0000000..bbd2867 --- /dev/null +++ b/src/libs/sha.lua @@ -0,0 +1,2962 @@ +-------------------------------------------------------------------------------------------------------------------------- +-- sha2.lua +-------------------------------------------------------------------------------------------------------------------------- +-- VERSION: 9 (2020-05-10) +-- AUTHOR: Egor Skriptunoff +-- LICENSE: MIT (the same license as Lua itself) +-- +-- +-- DESCRIPTION: +-- This module contains functions to calculate SHA digest: +-- MD5, SHA-1, +-- SHA-224, SHA-256, SHA-512/224, SHA-512/256, SHA-384, SHA-512, +-- SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256, +-- HMAC +-- Written in pure Lua. +-- Compatible with: +-- Lua 5.1, Lua 5.2, Lua 5.3, Lua 5.4, Fengari, LuaJIT 2.0/2.1 (any CPU endianness). +-- Main feature of this module: it was heavily optimized for speed. +-- For every Lua version the module contains particular implementation branch to get benefits from version-specific features. +-- - branch for Lua 5.1 (emulating bitwise operators using look-up table) +-- - branch for Lua 5.2 (using bit32/bit library), suitable for both Lua 5.2 with native "bit32" and Lua 5.1 with external library "bit" +-- - branch for Lua 5.3/5.4 (using native 64-bit bitwise operators) +-- - branch for Lua 5.3/5.4 (using native 32-bit bitwise operators) for Lua built with LUA_INT_TYPE=LUA_INT_INT +-- - branch for LuaJIT without FFI library (useful in a sandboxed environment) +-- - branch for LuaJIT x86 without FFI library (LuaJIT x86 has oddity because of lack of CPU registers) +-- - branch for LuaJIT 2.0 with FFI library (bit.* functions work only with Lua numbers) +-- - branch for LuaJIT 2.1 with FFI library (bit.* functions can work with "int64_t" arguments) +-- +-- +-- USAGE: +-- Input data should be provided as a binary string: either as a whole string or as a sequence of substrings (chunk-by-chunk loading, total length < 9*10^15 bytes). +-- Result (SHA digest) is returned in hexadecimal representation as a string of lowercase hex digits. +-- Simplest usage example: +-- local sha = require("sha2") +-- local your_hash = sha.sha256("your string") +-- See file "sha2_test.lua" for more examples. +-- +-- +-- CHANGELOG: +-- version date description +-- ------- ---------- ----------- +-- 9 2020-05-10 Now works in OpenWrt's Lua (dialect of Lua 5.1 with "double" + "invisible int32") +-- 8 2019-09-03 SHA3 functions added +-- 7 2019-03-17 Added functions to convert to/from base64 +-- 6 2018-11-12 HMAC added +-- 5 2018-11-10 SHA-1 added +-- 4 2018-11-03 MD5 added +-- 3 2018-11-02 Bug fixed: incorrect hashing of long (2 GByte) data streams on Lua 5.3/5.4 built with "int32" integers +-- 2 2018-10-07 Decreased module loading time in Lua 5.1 implementation branch (thanks to Peter Melnichenko for giving a hint) +-- 1 2018-10-06 First release (only SHA-2 functions) +----------------------------------------------------------------------------- + +local print_debug_messages = false -- set to true to view some messages about your system's abilities and implementation branch chosen for your system + +local unpack, table_concat, byte, char, string_rep, sub, gsub, gmatch, string_format, floor, ceil, math_min, math_max, tonumber, type = + table.unpack or unpack, table.concat, string.byte, string.char, string.rep, string.sub, string.gsub, string.gmatch, string.format, math.floor, math.ceil, math.min, math.max, tonumber, type + + +-------------------------------------------------------------------------------- +-- EXAMINING YOUR SYSTEM +-------------------------------------------------------------------------------- + +local function get_precision(one) + -- "one" must be either float 1.0 or integer 1 + -- returns bits_precision, is_integer + -- This function works correctly with all floating point datatypes (including non-IEEE-754) + local k, n, m, prev_n = 0, one, one + while true do + k, prev_n, n, m = k + 1, n, n + n + 1, m + m + k % 2 + if k > 256 or n - (n - 1) ~= 1 or m - (m - 1) ~= 1 or n == m then + return k, false -- floating point datatype + elseif n == prev_n then + return k, true -- integer datatype + end + end +end + +-- Make sure Lua has "double" numbers +local x = 2/3 +local Lua_has_double = x * 5 > 3 and x * 4 < 3 and get_precision(1.0) >= 53 +assert(Lua_has_double, "at least 53-bit floating point numbers are required") + +-- Q: +-- SHA2 was designed for FPU-less machines. +-- So, why floating point numbers are needed for this module? +-- A: +-- 53-bit "double" numbers are useful to calculate "magic numbers" used in SHA. +-- I prefer to write 50 LOC "magic numbers calculator" instead of storing more than 200 constants explicitly in this source file. + +local int_prec, Lua_has_integers = get_precision(1) +local Lua_has_int64 = Lua_has_integers and int_prec == 64 +local Lua_has_int32 = Lua_has_integers and int_prec == 32 +assert(Lua_has_int64 or Lua_has_int32 or not Lua_has_integers, "Lua integers must be either 32-bit or 64-bit") + +-- Q: +-- Does it mean that almost all non-standard configurations are not supported? +-- A: +-- Yes. Sorry, too many problems to support all possible Lua numbers configurations. +-- Lua 5.1/5.2 with "int32" will not work. +-- Lua 5.1/5.2 with "int64" will not work. +-- Lua 5.1/5.2 with "int128" will not work. +-- Lua 5.1/5.2 with "float" will not work. +-- Lua 5.1/5.2 with "double" is OK. (default config for Lua 5.1, Lua 5.2, LuaJIT) +-- Lua 5.3/5.4 with "int32" + "float" will not work. +-- Lua 5.3/5.4 with "int64" + "float" will not work. +-- Lua 5.3/5.4 with "int128" + "float" will not work. +-- Lua 5.3/5.4 with "int32" + "double" is OK. (config used by Fengari) +-- Lua 5.3/5.4 with "int64" + "double" is OK. (default config for Lua 5.3, Lua 5.4) +-- Lua 5.3/5.4 with "int128" + "double" will not work. +-- Using floating point numbers better than "double" instead of "double" is OK (non-IEEE-754 floating point implementation are allowed). +-- Using "int128" instead of "int64" is not OK: "int128" would require different branch of implementation for optimized SHA512. + +-- Check for LuaJIT and 32-bit bitwise libraries +local is_LuaJIT = ({false, [1] = true})[1] and (type(jit) ~= "table" or jit.version_num >= 20000) -- LuaJIT 1.x.x is treated as vanilla Lua 5.1 +local is_LuaJIT_21 -- LuaJIT 2.1+ +local LuaJIT_arch +local ffi -- LuaJIT FFI library (as a table) +local b -- 32-bit bitwise library (as a table) +local library_name + +if is_LuaJIT then + -- Assuming "bit" library is always available on LuaJIT + b = require"bit" + library_name = "bit" + -- "ffi" is intentionally disabled on some systems for safety reason + local LuaJIT_has_FFI, result = pcall(require, "ffi") + if LuaJIT_has_FFI then + ffi = result + end + is_LuaJIT_21 = not not loadstring"b=0b0" + LuaJIT_arch = type(jit) == "table" and jit.arch or ffi and ffi.arch or nil +else + -- For vanilla Lua, "bit"/"bit32" libraries are searched in global namespace only. No attempt is made to load a library if it's not loaded yet. + for _, libname in ipairs(_VERSION == "Lua 5.2" and {"bit32", "bit"} or {"bit", "bit32"}) do + if type(_G[libname]) == "table" and _G[libname].bxor then + b = _G[libname] + library_name = libname + break + end + end +end + +-------------------------------------------------------------------------------- +-- You can disable here some of your system's abilities (for testing purposes) +-------------------------------------------------------------------------------- +-- is_LuaJIT = nil +-- is_LuaJIT_21 = nil +-- ffi = nil +-- Lua_has_int32 = nil +-- Lua_has_int64 = nil +-- b, library_name = nil +-------------------------------------------------------------------------------- + +if print_debug_messages then + -- Printing list of abilities of your system + print("Abilities:") + print(" Lua version: "..(is_LuaJIT and "LuaJIT "..(is_LuaJIT_21 and "2.1 " or "2.0 ")..(LuaJIT_arch or "")..(ffi and " with FFI" or " without FFI") or _VERSION)) + print(" Integer bitwise operators: "..(Lua_has_int64 and "int64" or Lua_has_int32 and "int32" or "no")) + print(" 32-bit bitwise library: "..(library_name or "not found")) +end + +-- Selecting the most suitable implementation for given set of abilities +local method, branch +if is_LuaJIT and ffi then + method = "Using 'ffi' library of LuaJIT" + branch = "FFI" +elseif is_LuaJIT then + method = "Using special code for FFI-less LuaJIT" + branch = "LJ" +elseif Lua_has_int64 then + method = "Using native int64 bitwise operators" + branch = "INT64" +elseif Lua_has_int32 then + method = "Using native int32 bitwise operators" + branch = "INT32" +elseif library_name then -- when bitwise library is available (Lua 5.2 with native library "bit32" or Lua 5.1 with external library "bit") + method = "Using '"..library_name.."' library" + branch = "LIB32" +else + method = "Emulating bitwise operators using look-up table" + branch = "EMUL" +end + +if print_debug_messages then + -- Printing the implementation selected to be used on your system + print("Implementation selected:") + print(" "..method) +end + + +-------------------------------------------------------------------------------- +-- BASIC 32-BIT BITWISE FUNCTIONS +-------------------------------------------------------------------------------- + +local AND, OR, XOR, SHL, SHR, ROL, ROR, NOT, NORM, HEX, XOR_BYTE +-- Only low 32 bits of function arguments matter, high bits are ignored +-- The result of all functions (except HEX) is an integer inside "correct range": +-- for "bit" library: (-2^31)..(2^31-1) +-- for "bit32" library: 0..(2^32-1) + +if branch == "FFI" or branch == "LJ" or branch == "LIB32" then + + -- Your system has 32-bit bitwise library (either "bit" or "bit32") + + AND = b.band -- 2 arguments + OR = b.bor -- 2 arguments + XOR = b.bxor -- 2..5 arguments + SHL = b.lshift -- second argument is integer 0..31 + SHR = b.rshift -- second argument is integer 0..31 + ROL = b.rol or b.lrotate -- second argument is integer 0..31 + ROR = b.ror or b.rrotate -- second argument is integer 0..31 + NOT = b.bnot -- only for LuaJIT + NORM = b.tobit -- only for LuaJIT + HEX = b.tohex -- returns string of 8 lowercase hexadecimal digits + assert(AND and OR and XOR and SHL and SHR and ROL and ROR and NOT, "Library '"..library_name.."' is incomplete") + XOR_BYTE = XOR -- XOR of two bytes (0..255) + +elseif branch == "EMUL" then + + -- Emulating 32-bit bitwise operations using 53-bit floating point arithmetic + + function SHL(x, n) + return (x * 2^n) % 2^32 + end + + function SHR(x, n) + -- return (x % 2^32 - x % 2^n) / 2^n + x = x % 2^32 / 2^n + return x - x % 1 + end + + function ROL(x, n) + x = x % 2^32 * 2^n + local r = x % 2^32 + return r + (x - r) / 2^32 + end + + function ROR(x, n) + x = x % 2^32 / 2^n + local r = x % 1 + return r * 2^32 + (x - r) + end + + local AND_of_two_bytes = {[0] = 0} -- look-up table (256*256 entries) + local idx = 0 + for y = 0, 127 * 256, 256 do + for x = y, y + 127 do + x = AND_of_two_bytes[x] * 2 + AND_of_two_bytes[idx] = x + AND_of_two_bytes[idx + 1] = x + AND_of_two_bytes[idx + 256] = x + AND_of_two_bytes[idx + 257] = x + 1 + idx = idx + 2 + end + idx = idx + 256 + end + + local function and_or_xor(x, y, operation) + -- operation: nil = AND, 1 = OR, 2 = XOR + local x0 = x % 2^32 + local y0 = y % 2^32 + local rx = x0 % 256 + local ry = y0 % 256 + local res = AND_of_two_bytes[rx + ry * 256] + x = x0 - rx + y = (y0 - ry) / 256 + rx = x % 65536 + ry = y % 256 + res = res + AND_of_two_bytes[rx + ry] * 256 + x = (x - rx) / 256 + y = (y - ry) / 256 + rx = x % 65536 + y % 256 + res = res + AND_of_two_bytes[rx] * 65536 + res = res + AND_of_two_bytes[(x + y - rx) / 256] * 16777216 + if operation then + res = x0 + y0 - operation * res + end + return res + end + + function AND(x, y) + return and_or_xor(x, y) + end + + function OR(x, y) + return and_or_xor(x, y, 1) + end + + function XOR(x, y, z, t, u) -- 2..5 arguments + if z then + if t then + if u then + t = and_or_xor(t, u, 2) + end + z = and_or_xor(z, t, 2) + end + y = and_or_xor(y, z, 2) + end + return and_or_xor(x, y, 2) + end + + function XOR_BYTE(x, y) + return x + y - 2 * AND_of_two_bytes[x + y * 256] + end + +end + +HEX = HEX + or + pcall(string_format, "%x", 2^31) and + function (x) -- returns string of 8 lowercase hexadecimal digits + return string_format("%08x", x % 4294967296) + end + or + function (x) -- for OpenWrt's dialect of Lua + return string_format("%08x", (x + 2^31) % 2^32 - 2^31) + end + +local function XOR32A5(x) + return XOR(x, 0xA5A5A5A5) % 4294967296 +end + +local function create_array_of_lanes() + return {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} +end + + +-------------------------------------------------------------------------------- +-- CREATING OPTIMIZED INNER LOOP +-------------------------------------------------------------------------------- + +-- Inner loop functions +local sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed + +-- Arrays of SHA2 "magic numbers" (in "INT64" and "FFI" branches "*_lo" arrays contain 64-bit values) +local sha2_K_lo, sha2_K_hi, sha2_H_lo, sha2_H_hi, sha3_RC_lo, sha3_RC_hi = {}, {}, {}, {}, {}, {} +local sha2_H_ext256 = {[224] = {}, [256] = sha2_H_hi} +local sha2_H_ext512_lo, sha2_H_ext512_hi = {[384] = {}, [512] = sha2_H_lo}, {[384] = {}, [512] = sha2_H_hi} +local md5_K, md5_sha1_H = {}, {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0} +local md5_next_shift = {0, 0, 0, 0, 0, 0, 0, 0, 28, 25, 26, 27, 0, 0, 10, 9, 11, 12, 0, 15, 16, 17, 18, 0, 20, 22, 23, 21} +local HEX64, XOR64A5, lanes_index_base -- defined only for branches that internally use 64-bit integers: "INT64" and "FFI" +local common_W = {} -- temporary table shared between all calculations (to avoid creating new temporary table every time) +local K_lo_modulo, hi_factor, hi_factor_keccak = 4294967296, 0, 0 + +local function build_keccak_format(elem) + local keccak_format = {} + for _, size in ipairs{1, 9, 13, 17, 18, 21} do + keccak_format[size] = "<"..string_rep(elem, size) + end + return keccak_format +end + + +if branch == "FFI" then + + + -- SHA256 implementation for "LuaJIT with FFI" branch + + local common_W_FFI_int32 = ffi.new"int32_t[80]" -- 64 is enough for SHA256, but 80 is needed for SHA-1 + + function sha256_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W_FFI_int32, sha2_K_hi + for pos = offs, offs + size - 1, 64 do + for j = 0, 15 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) + end + for j = 16, 63 do + local a, b = W[j-15], W[j-2] + W[j] = NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16] ) + end + local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for j = 0, 63, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap) + local z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j] + K[j+1] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+1] + K[j+2] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+2] + K[j+3] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+3] + K[j+4] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+4] + K[j+5] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+5] + K[j+6] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+6] + K[j+7] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+7] + K[j+8] + h) ) + h, g, f, e = g, f, e, NORM( d + z ) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + end + H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]) + H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8]) + end + end + + local common_W_FFI_int64 = ffi.new"int64_t[80]" + local int64 = ffi.typeof"int64_t" + local int32 = ffi.typeof"int32_t" + local uint32 = ffi.typeof"uint32_t" + + hi_factor = int64(2^32) + + if is_LuaJIT_21 then -- LuaJIT 2.1 supports bitwise 64-bit operations + + local AND64, OR64, XOR64, NOT64, SHL64, SHR64, ROL64, ROR64 -- introducing synonyms for better code readability + = AND, OR, XOR, NOT, SHL, SHR, ROL, ROR + HEX64 = HEX + + + -- SHA3 implementation for "LuaJIT 2.1 + FFI" branch + + local lanes_arr64 = ffi.typeof"int64_t[30]" -- 25 + 5 for temporary usage + -- lanes array is indexed from 0 + lanes_index_base = 0 + hi_factor_keccak = int64(2^32) + + function create_array_of_lanes() + return lanes_arr64() + end + + function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes) + -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8 + local RC = sha3_RC_lo + local qwords_qty = SHR(block_size_in_bytes, 3) + for pos = offs, offs + size - 1, block_size_in_bytes do + for j = 0, qwords_qty - 1 do + pos = pos + 8 + local h, g, f, e, d, c, b, a = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness + lanes[j] = XOR64(lanes[j], OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h))))) + end + for round_idx = 1, 24 do + for j = 0, 4 do + lanes[25 + j] = XOR64(lanes[j], lanes[j+5], lanes[j+10], lanes[j+15], lanes[j+20]) + end + local D = XOR64(lanes[25], ROL64(lanes[27], 1)) + lanes[1], lanes[6], lanes[11], lanes[16] = ROL64(XOR64(D, lanes[6]), 44), ROL64(XOR64(D, lanes[16]), 45), ROL64(XOR64(D, lanes[1]), 1), ROL64(XOR64(D, lanes[11]), 10) + lanes[21] = ROL64(XOR64(D, lanes[21]), 2) + D = XOR64(lanes[26], ROL64(lanes[28], 1)) + lanes[2], lanes[7], lanes[12], lanes[22] = ROL64(XOR64(D, lanes[12]), 43), ROL64(XOR64(D, lanes[22]), 61), ROL64(XOR64(D, lanes[7]), 6), ROL64(XOR64(D, lanes[2]), 62) + lanes[17] = ROL64(XOR64(D, lanes[17]), 15) + D = XOR64(lanes[27], ROL64(lanes[29], 1)) + lanes[3], lanes[8], lanes[18], lanes[23] = ROL64(XOR64(D, lanes[18]), 21), ROL64(XOR64(D, lanes[3]), 28), ROL64(XOR64(D, lanes[23]), 56), ROL64(XOR64(D, lanes[8]), 55) + lanes[13] = ROL64(XOR64(D, lanes[13]), 25) + D = XOR64(lanes[28], ROL64(lanes[25], 1)) + lanes[4], lanes[14], lanes[19], lanes[24] = ROL64(XOR64(D, lanes[24]), 14), ROL64(XOR64(D, lanes[19]), 8), ROL64(XOR64(D, lanes[4]), 27), ROL64(XOR64(D, lanes[14]), 39) + lanes[9] = ROL64(XOR64(D, lanes[9]), 20) + D = XOR64(lanes[29], ROL64(lanes[26], 1)) + lanes[5], lanes[10], lanes[15], lanes[20] = ROL64(XOR64(D, lanes[10]), 3), ROL64(XOR64(D, lanes[20]), 18), ROL64(XOR64(D, lanes[5]), 36), ROL64(XOR64(D, lanes[15]), 41) + lanes[0] = XOR64(D, lanes[0]) + lanes[0], lanes[1], lanes[2], lanes[3], lanes[4] = XOR64(lanes[0], AND64(NOT64(lanes[1]), lanes[2]), RC[round_idx]), XOR64(lanes[1], AND64(NOT64(lanes[2]), lanes[3])), XOR64(lanes[2], AND64(NOT64(lanes[3]), lanes[4])), XOR64(lanes[3], AND64(NOT64(lanes[4]), lanes[0])), XOR64(lanes[4], AND64(NOT64(lanes[0]), lanes[1])) + lanes[5], lanes[6], lanes[7], lanes[8], lanes[9] = XOR64(lanes[8], AND64(NOT64(lanes[9]), lanes[5])), XOR64(lanes[9], AND64(NOT64(lanes[5]), lanes[6])), XOR64(lanes[5], AND64(NOT64(lanes[6]), lanes[7])), XOR64(lanes[6], AND64(NOT64(lanes[7]), lanes[8])), XOR64(lanes[7], AND64(NOT64(lanes[8]), lanes[9])) + lanes[10], lanes[11], lanes[12], lanes[13], lanes[14] = XOR64(lanes[11], AND64(NOT64(lanes[12]), lanes[13])), XOR64(lanes[12], AND64(NOT64(lanes[13]), lanes[14])), XOR64(lanes[13], AND64(NOT64(lanes[14]), lanes[10])), XOR64(lanes[14], AND64(NOT64(lanes[10]), lanes[11])), XOR64(lanes[10], AND64(NOT64(lanes[11]), lanes[12])) + lanes[15], lanes[16], lanes[17], lanes[18], lanes[19] = XOR64(lanes[19], AND64(NOT64(lanes[15]), lanes[16])), XOR64(lanes[15], AND64(NOT64(lanes[16]), lanes[17])), XOR64(lanes[16], AND64(NOT64(lanes[17]), lanes[18])), XOR64(lanes[17], AND64(NOT64(lanes[18]), lanes[19])), XOR64(lanes[18], AND64(NOT64(lanes[19]), lanes[15])) + lanes[20], lanes[21], lanes[22], lanes[23], lanes[24] = XOR64(lanes[22], AND64(NOT64(lanes[23]), lanes[24])), XOR64(lanes[23], AND64(NOT64(lanes[24]), lanes[20])), XOR64(lanes[24], AND64(NOT64(lanes[20]), lanes[21])), XOR64(lanes[20], AND64(NOT64(lanes[21]), lanes[22])), XOR64(lanes[21], AND64(NOT64(lanes[22]), lanes[23])) + end + end + end + + + -- SHA512 implementation for "LuaJIT 2.1 + FFI" branch + + local A5_long = 0xA5A5A5A5 * int64(2^32 + 1) -- It's impossible to use constant 0xA5A5A5A5A5A5A5A5LL because it will raise syntax error on other Lua versions + + function XOR64A5(long) + return XOR64(long, A5_long) + end + + function sha512_feed_128(H, _, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + local W, K = common_W_FFI_int64, sha2_K_lo + for pos = offs, offs + size - 1, 128 do + for j = 0, 15 do + pos = pos + 8 + local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness + W[j] = OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))) + end + for j = 16, 79 do + local a, b = W[j-15], W[j-2] + W[j] = XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) + XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) + W[j-7] + W[j-16] + end + local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for j = 0, 79, 8 do + local z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+1] + W[j] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+2] + W[j+1] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+3] + W[j+2] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+4] + W[j+3] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+5] + W[j+4] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+6] + W[j+5] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+7] + W[j+6] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+8] + W[j+7] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z + end + H[1] = a + H[1] + H[2] = b + H[2] + H[3] = c + H[3] + H[4] = d + H[4] + H[5] = e + H[5] + H[6] = f + H[6] + H[7] = g + H[7] + H[8] = h + H[8] + end + end + + else -- LuaJIT 2.0 doesn't support 64-bit bitwise operations + + + -- SHA512 implementation for "LuaJIT 2.0 + FFI" branch + + local union64 = ffi.typeof"union{int64_t i64; struct{int32_t lo, hi;} i32;}" + do -- make sure the struct is endianness-compatible + local u = union64(1) + if u.i32.lo < u.i32.hi then + union64 = ffi.typeof"union{int64_t i64; struct{int32_t hi, lo;} i32;}" + end + end + local unions64 = ffi.typeof("$[?]", union64) + local U = unions64(3) -- this array of unions is used for fast splitting int64 into int32_high and int32_low + + -- "xorrific" 64-bit functions :-) + -- int64 input is splitted into two int32 parts, some bitwise 32-bit operations are performed, finally the result is converted to int64 + -- these functions are needed because bit.* functions in LuaJIT 2.0 don't work with int64_t + + local function XORROR64_1(a) + -- return XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) + U[0].i64 = a + local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi + local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25))) + local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7)) + return t_hi * int64(2^32) + uint32(int32(t_lo)) + end + + local function XORROR64_2(b) + -- return XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) + U[0].i64 = b + local b_lo, b_hi = U[0].i32.lo, U[0].i32.hi + local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26))) + local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6)) + return u_hi * int64(2^32) + uint32(int32(u_lo)) + end + + local function XORROR64_3(e) + -- return XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + U[0].i64 = e + local e_lo, e_hi = U[0].i32.lo, U[0].i32.hi + local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9))) + local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9))) + return u_hi * int64(2^32) + uint32(int32(u_lo)) + end + + local function XORROR64_6(a) + -- return XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + U[0].i64 = a + local b_lo, b_hi = U[0].i32.lo, U[0].i32.hi + local u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7))) + local u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7))) + return u_hi * int64(2^32) + uint32(int32(u_lo)) + end + + local function XORROR64_4(e, f, g) + -- return XOR64(g, AND64(e, XOR64(f, g))) + U[0].i64 = f + U[1].i64 = g + U[2].i64 = e + local f_lo, f_hi = U[0].i32.lo, U[0].i32.hi + local g_lo, g_hi = U[1].i32.lo, U[1].i32.hi + local e_lo, e_hi = U[2].i32.lo, U[2].i32.hi + local result_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo))) + local result_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi))) + return result_hi * int64(2^32) + uint32(int32(result_lo)) + end + + local function XORROR64_5(a, b, c) + -- return XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + U[0].i64 = a + U[1].i64 = b + U[2].i64 = c + local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi + local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi + local c_lo, c_hi = U[2].i32.lo, U[2].i32.hi + local result_lo = XOR(AND(XOR(a_lo, b_lo), c_lo), AND(a_lo, b_lo)) + local result_hi = XOR(AND(XOR(a_hi, b_hi), c_hi), AND(a_hi, b_hi)) + return result_hi * int64(2^32) + uint32(int32(result_lo)) + end + + function XOR64A5(long) + -- return XOR64(long, 0xA5A5A5A5A5A5A5A5) + U[0].i64 = long + local lo32, hi32 = U[0].i32.lo, U[0].i32.hi + lo32 = XOR(lo32, 0xA5A5A5A5) + hi32 = XOR(hi32, 0xA5A5A5A5) + return hi32 * int64(2^32) + uint32(int32(lo32)) + end + + function HEX64(long) + U[0].i64 = long + return HEX(U[0].i32.hi)..HEX(U[0].i32.lo) + end + + function sha512_feed_128(H, _, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + local W, K = common_W_FFI_int64, sha2_K_lo + for pos = offs, offs + size - 1, 128 do + for j = 0, 15 do + pos = pos + 8 + local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32) + uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h))) + end + for j = 16, 79 do + W[j] = XORROR64_1(W[j-15]) + XORROR64_2(W[j-2]) + W[j-7] + W[j-16] + end + local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for j = 0, 79, 8 do + local z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+1] + W[j] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+2] + W[j+1] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+3] + W[j+2] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+4] + W[j+3] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+5] + W[j+4] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+6] + W[j+5] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+7] + W[j+6] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+8] + W[j+7] + h, g, f, e = g, f, e, z + d + d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z + end + H[1] = a + H[1] + H[2] = b + H[2] + H[3] = c + H[3] + H[4] = d + H[4] + H[5] = e + H[5] + H[6] = f + H[6] + H[7] = g + H[7] + H[8] = h + H[8] + end + end + + end + + + -- MD5 implementation for "LuaJIT with FFI" branch + + function md5_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W_FFI_int32, md5_K + for pos = offs, offs + size - 1, 64 do + for j = 0, 15 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness + W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a) + end + local a, b, c, d = H[1], H[2], H[3], H[4] + for j = 0, 15, 4 do + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j ] + a), 7) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+1] + a), 12) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+2] + a), 17) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+4] + W[j+3] + a), 22) + b) + end + for j = 16, 31, 4 do + local g = 5*j + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g + 1, 15)] + a), 5) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 6, 15)] + a), 9) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g - 5, 15)] + a), 14) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+4] + W[AND(g , 15)] + a), 20) + b) + end + for j = 32, 47, 4 do + local g = 3*j + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 5, 15)] + a), 4) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 8, 15)] + a), 11) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 5, 15)] + a), 16) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+4] + W[AND(g - 2, 15)] + a), 23) + b) + end + for j = 48, 63, 4 do + local g = 7*j + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g , 15)] + a), 6) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15)] + a), 10) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15)] + a), 15) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+4] + W[AND(g + 5, 15)] + a), 21) + b) + end + H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]) + end + end + + + -- SHA-1 implementation for "LuaJIT with FFI" branch + + function sha1_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W = common_W_FFI_int32 + for pos = offs, offs + size - 1, 64 do + for j = 0, 15 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) + end + for j = 16, 79 do + W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1) + end + local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5] + for j = 0, 19, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + 0x5A827999 + e)) -- constant = floor(2^30 * sqrt(2)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e)) + end + for j = 20, 39, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0x6ED9EBA1 + e)) -- 2^30 * sqrt(3) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e)) + end + for j = 40, 59, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + 0x8F1BBCDC + e)) -- 2^30 * sqrt(5) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e)) + end + for j = 60, 79, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0xCA62C1D6 + e)) -- 2^30 * sqrt(10) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e)) + end + H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5]) + end + end + +end + + +-- SHA3 implementation for "LuaJIT 2.0 + FFI" and "LuaJIT without FFI" branches + +if branch == "FFI" and not is_LuaJIT_21 or branch == "LJ" then + + if branch == "FFI" then + local lanes_arr32 = ffi.typeof"int32_t[31]" -- 25 + 5 + 1 (due to 1-based indexing) + + function create_array_of_lanes() + return lanes_arr32() + end + + end + + function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes) + -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8 + local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi + local qwords_qty = SHR(block_size_in_bytes, 3) + for pos = offs, offs + size - 1, block_size_in_bytes do + for j = 1, qwords_qty do + local a, b, c, d = byte(str, pos + 1, pos + 4) + lanes_lo[j] = XOR(lanes_lo[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)) + pos = pos + 8 + a, b, c, d = byte(str, pos - 3, pos) + lanes_hi[j] = XOR(lanes_hi[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)) + end + for round_idx = 1, 24 do + for j = 1, 5 do + lanes_lo[25 + j] = XOR(lanes_lo[j], lanes_lo[j + 5], lanes_lo[j + 10], lanes_lo[j + 15], lanes_lo[j + 20]) + end + for j = 1, 5 do + lanes_hi[25 + j] = XOR(lanes_hi[j], lanes_hi[j + 5], lanes_hi[j + 10], lanes_hi[j + 15], lanes_hi[j + 20]) + end + local D_lo = XOR(lanes_lo[26], SHL(lanes_lo[28], 1), SHR(lanes_hi[28], 31)) + local D_hi = XOR(lanes_hi[26], SHL(lanes_hi[28], 1), SHR(lanes_lo[28], 31)) + lanes_lo[2], lanes_hi[2], lanes_lo[7], lanes_hi[7], lanes_lo[12], lanes_hi[12], lanes_lo[17], lanes_hi[17] = XOR(SHR(XOR(D_lo, lanes_lo[7]), 20), SHL(XOR(D_hi, lanes_hi[7]), 12)), XOR(SHR(XOR(D_hi, lanes_hi[7]), 20), SHL(XOR(D_lo, lanes_lo[7]), 12)), XOR(SHR(XOR(D_lo, lanes_lo[17]), 19), SHL(XOR(D_hi, lanes_hi[17]), 13)), XOR(SHR(XOR(D_hi, lanes_hi[17]), 19), SHL(XOR(D_lo, lanes_lo[17]), 13)), XOR(SHL(XOR(D_lo, lanes_lo[2]), 1), SHR(XOR(D_hi, lanes_hi[2]), 31)), XOR(SHL(XOR(D_hi, lanes_hi[2]), 1), SHR(XOR(D_lo, lanes_lo[2]), 31)), XOR(SHL(XOR(D_lo, lanes_lo[12]), 10), SHR(XOR(D_hi, lanes_hi[12]), 22)), XOR(SHL(XOR(D_hi, lanes_hi[12]), 10), SHR(XOR(D_lo, lanes_lo[12]), 22)) + local L, H = XOR(D_lo, lanes_lo[22]), XOR(D_hi, lanes_hi[22]) + lanes_lo[22], lanes_hi[22] = XOR(SHL(L, 2), SHR(H, 30)), XOR(SHL(H, 2), SHR(L, 30)) + D_lo = XOR(lanes_lo[27], SHL(lanes_lo[29], 1), SHR(lanes_hi[29], 31)) + D_hi = XOR(lanes_hi[27], SHL(lanes_hi[29], 1), SHR(lanes_lo[29], 31)) + lanes_lo[3], lanes_hi[3], lanes_lo[8], lanes_hi[8], lanes_lo[13], lanes_hi[13], lanes_lo[23], lanes_hi[23] = XOR(SHR(XOR(D_lo, lanes_lo[13]), 21), SHL(XOR(D_hi, lanes_hi[13]), 11)), XOR(SHR(XOR(D_hi, lanes_hi[13]), 21), SHL(XOR(D_lo, lanes_lo[13]), 11)), XOR(SHR(XOR(D_lo, lanes_lo[23]), 3), SHL(XOR(D_hi, lanes_hi[23]), 29)), XOR(SHR(XOR(D_hi, lanes_hi[23]), 3), SHL(XOR(D_lo, lanes_lo[23]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[8]), 6), SHR(XOR(D_hi, lanes_hi[8]), 26)), XOR(SHL(XOR(D_hi, lanes_hi[8]), 6), SHR(XOR(D_lo, lanes_lo[8]), 26)), XOR(SHR(XOR(D_lo, lanes_lo[3]), 2), SHL(XOR(D_hi, lanes_hi[3]), 30)), XOR(SHR(XOR(D_hi, lanes_hi[3]), 2), SHL(XOR(D_lo, lanes_lo[3]), 30)) + L, H = XOR(D_lo, lanes_lo[18]), XOR(D_hi, lanes_hi[18]) + lanes_lo[18], lanes_hi[18] = XOR(SHL(L, 15), SHR(H, 17)), XOR(SHL(H, 15), SHR(L, 17)) + D_lo = XOR(lanes_lo[28], SHL(lanes_lo[30], 1), SHR(lanes_hi[30], 31)) + D_hi = XOR(lanes_hi[28], SHL(lanes_hi[30], 1), SHR(lanes_lo[30], 31)) + lanes_lo[4], lanes_hi[4], lanes_lo[9], lanes_hi[9], lanes_lo[19], lanes_hi[19], lanes_lo[24], lanes_hi[24] = XOR(SHL(XOR(D_lo, lanes_lo[19]), 21), SHR(XOR(D_hi, lanes_hi[19]), 11)), XOR(SHL(XOR(D_hi, lanes_hi[19]), 21), SHR(XOR(D_lo, lanes_lo[19]), 11)), XOR(SHL(XOR(D_lo, lanes_lo[4]), 28), SHR(XOR(D_hi, lanes_hi[4]), 4)), XOR(SHL(XOR(D_hi, lanes_hi[4]), 28), SHR(XOR(D_lo, lanes_lo[4]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[24]), 8), SHL(XOR(D_hi, lanes_hi[24]), 24)), XOR(SHR(XOR(D_hi, lanes_hi[24]), 8), SHL(XOR(D_lo, lanes_lo[24]), 24)), XOR(SHR(XOR(D_lo, lanes_lo[9]), 9), SHL(XOR(D_hi, lanes_hi[9]), 23)), XOR(SHR(XOR(D_hi, lanes_hi[9]), 9), SHL(XOR(D_lo, lanes_lo[9]), 23)) + L, H = XOR(D_lo, lanes_lo[14]), XOR(D_hi, lanes_hi[14]) + lanes_lo[14], lanes_hi[14] = XOR(SHL(L, 25), SHR(H, 7)), XOR(SHL(H, 25), SHR(L, 7)) + D_lo = XOR(lanes_lo[29], SHL(lanes_lo[26], 1), SHR(lanes_hi[26], 31)) + D_hi = XOR(lanes_hi[29], SHL(lanes_hi[26], 1), SHR(lanes_lo[26], 31)) + lanes_lo[5], lanes_hi[5], lanes_lo[15], lanes_hi[15], lanes_lo[20], lanes_hi[20], lanes_lo[25], lanes_hi[25] = XOR(SHL(XOR(D_lo, lanes_lo[25]), 14), SHR(XOR(D_hi, lanes_hi[25]), 18)), XOR(SHL(XOR(D_hi, lanes_hi[25]), 14), SHR(XOR(D_lo, lanes_lo[25]), 18)), XOR(SHL(XOR(D_lo, lanes_lo[20]), 8), SHR(XOR(D_hi, lanes_hi[20]), 24)), XOR(SHL(XOR(D_hi, lanes_hi[20]), 8), SHR(XOR(D_lo, lanes_lo[20]), 24)), XOR(SHL(XOR(D_lo, lanes_lo[5]), 27), SHR(XOR(D_hi, lanes_hi[5]), 5)), XOR(SHL(XOR(D_hi, lanes_hi[5]), 27), SHR(XOR(D_lo, lanes_lo[5]), 5)), XOR(SHR(XOR(D_lo, lanes_lo[15]), 25), SHL(XOR(D_hi, lanes_hi[15]), 7)), XOR(SHR(XOR(D_hi, lanes_hi[15]), 25), SHL(XOR(D_lo, lanes_lo[15]), 7)) + L, H = XOR(D_lo, lanes_lo[10]), XOR(D_hi, lanes_hi[10]) + lanes_lo[10], lanes_hi[10] = XOR(SHL(L, 20), SHR(H, 12)), XOR(SHL(H, 20), SHR(L, 12)) + D_lo = XOR(lanes_lo[30], SHL(lanes_lo[27], 1), SHR(lanes_hi[27], 31)) + D_hi = XOR(lanes_hi[30], SHL(lanes_hi[27], 1), SHR(lanes_lo[27], 31)) + lanes_lo[6], lanes_hi[6], lanes_lo[11], lanes_hi[11], lanes_lo[16], lanes_hi[16], lanes_lo[21], lanes_hi[21] = XOR(SHL(XOR(D_lo, lanes_lo[11]), 3), SHR(XOR(D_hi, lanes_hi[11]), 29)), XOR(SHL(XOR(D_hi, lanes_hi[11]), 3), SHR(XOR(D_lo, lanes_lo[11]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[21]), 18), SHR(XOR(D_hi, lanes_hi[21]), 14)), XOR(SHL(XOR(D_hi, lanes_hi[21]), 18), SHR(XOR(D_lo, lanes_lo[21]), 14)), XOR(SHR(XOR(D_lo, lanes_lo[6]), 28), SHL(XOR(D_hi, lanes_hi[6]), 4)), XOR(SHR(XOR(D_hi, lanes_hi[6]), 28), SHL(XOR(D_lo, lanes_lo[6]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[16]), 23), SHL(XOR(D_hi, lanes_hi[16]), 9)), XOR(SHR(XOR(D_hi, lanes_hi[16]), 23), SHL(XOR(D_lo, lanes_lo[16]), 9)) + lanes_lo[1], lanes_hi[1] = XOR(D_lo, lanes_lo[1]), XOR(D_hi, lanes_hi[1]) + lanes_lo[1], lanes_lo[2], lanes_lo[3], lanes_lo[4], lanes_lo[5] = XOR(lanes_lo[1], AND(NOT(lanes_lo[2]), lanes_lo[3]), RC_lo[round_idx]), XOR(lanes_lo[2], AND(NOT(lanes_lo[3]), lanes_lo[4])), XOR(lanes_lo[3], AND(NOT(lanes_lo[4]), lanes_lo[5])), XOR(lanes_lo[4], AND(NOT(lanes_lo[5]), lanes_lo[1])), XOR(lanes_lo[5], AND(NOT(lanes_lo[1]), lanes_lo[2])) + lanes_lo[6], lanes_lo[7], lanes_lo[8], lanes_lo[9], lanes_lo[10] = XOR(lanes_lo[9], AND(NOT(lanes_lo[10]), lanes_lo[6])), XOR(lanes_lo[10], AND(NOT(lanes_lo[6]), lanes_lo[7])), XOR(lanes_lo[6], AND(NOT(lanes_lo[7]), lanes_lo[8])), XOR(lanes_lo[7], AND(NOT(lanes_lo[8]), lanes_lo[9])), XOR(lanes_lo[8], AND(NOT(lanes_lo[9]), lanes_lo[10])) + lanes_lo[11], lanes_lo[12], lanes_lo[13], lanes_lo[14], lanes_lo[15] = XOR(lanes_lo[12], AND(NOT(lanes_lo[13]), lanes_lo[14])), XOR(lanes_lo[13], AND(NOT(lanes_lo[14]), lanes_lo[15])), XOR(lanes_lo[14], AND(NOT(lanes_lo[15]), lanes_lo[11])), XOR(lanes_lo[15], AND(NOT(lanes_lo[11]), lanes_lo[12])), XOR(lanes_lo[11], AND(NOT(lanes_lo[12]), lanes_lo[13])) + lanes_lo[16], lanes_lo[17], lanes_lo[18], lanes_lo[19], lanes_lo[20] = XOR(lanes_lo[20], AND(NOT(lanes_lo[16]), lanes_lo[17])), XOR(lanes_lo[16], AND(NOT(lanes_lo[17]), lanes_lo[18])), XOR(lanes_lo[17], AND(NOT(lanes_lo[18]), lanes_lo[19])), XOR(lanes_lo[18], AND(NOT(lanes_lo[19]), lanes_lo[20])), XOR(lanes_lo[19], AND(NOT(lanes_lo[20]), lanes_lo[16])) + lanes_lo[21], lanes_lo[22], lanes_lo[23], lanes_lo[24], lanes_lo[25] = XOR(lanes_lo[23], AND(NOT(lanes_lo[24]), lanes_lo[25])), XOR(lanes_lo[24], AND(NOT(lanes_lo[25]), lanes_lo[21])), XOR(lanes_lo[25], AND(NOT(lanes_lo[21]), lanes_lo[22])), XOR(lanes_lo[21], AND(NOT(lanes_lo[22]), lanes_lo[23])), XOR(lanes_lo[22], AND(NOT(lanes_lo[23]), lanes_lo[24])) + lanes_hi[1], lanes_hi[2], lanes_hi[3], lanes_hi[4], lanes_hi[5] = XOR(lanes_hi[1], AND(NOT(lanes_hi[2]), lanes_hi[3]), RC_hi[round_idx]), XOR(lanes_hi[2], AND(NOT(lanes_hi[3]), lanes_hi[4])), XOR(lanes_hi[3], AND(NOT(lanes_hi[4]), lanes_hi[5])), XOR(lanes_hi[4], AND(NOT(lanes_hi[5]), lanes_hi[1])), XOR(lanes_hi[5], AND(NOT(lanes_hi[1]), lanes_hi[2])) + lanes_hi[6], lanes_hi[7], lanes_hi[8], lanes_hi[9], lanes_hi[10] = XOR(lanes_hi[9], AND(NOT(lanes_hi[10]), lanes_hi[6])), XOR(lanes_hi[10], AND(NOT(lanes_hi[6]), lanes_hi[7])), XOR(lanes_hi[6], AND(NOT(lanes_hi[7]), lanes_hi[8])), XOR(lanes_hi[7], AND(NOT(lanes_hi[8]), lanes_hi[9])), XOR(lanes_hi[8], AND(NOT(lanes_hi[9]), lanes_hi[10])) + lanes_hi[11], lanes_hi[12], lanes_hi[13], lanes_hi[14], lanes_hi[15] = XOR(lanes_hi[12], AND(NOT(lanes_hi[13]), lanes_hi[14])), XOR(lanes_hi[13], AND(NOT(lanes_hi[14]), lanes_hi[15])), XOR(lanes_hi[14], AND(NOT(lanes_hi[15]), lanes_hi[11])), XOR(lanes_hi[15], AND(NOT(lanes_hi[11]), lanes_hi[12])), XOR(lanes_hi[11], AND(NOT(lanes_hi[12]), lanes_hi[13])) + lanes_hi[16], lanes_hi[17], lanes_hi[18], lanes_hi[19], lanes_hi[20] = XOR(lanes_hi[20], AND(NOT(lanes_hi[16]), lanes_hi[17])), XOR(lanes_hi[16], AND(NOT(lanes_hi[17]), lanes_hi[18])), XOR(lanes_hi[17], AND(NOT(lanes_hi[18]), lanes_hi[19])), XOR(lanes_hi[18], AND(NOT(lanes_hi[19]), lanes_hi[20])), XOR(lanes_hi[19], AND(NOT(lanes_hi[20]), lanes_hi[16])) + lanes_hi[21], lanes_hi[22], lanes_hi[23], lanes_hi[24], lanes_hi[25] = XOR(lanes_hi[23], AND(NOT(lanes_hi[24]), lanes_hi[25])), XOR(lanes_hi[24], AND(NOT(lanes_hi[25]), lanes_hi[21])), XOR(lanes_hi[25], AND(NOT(lanes_hi[21]), lanes_hi[22])), XOR(lanes_hi[21], AND(NOT(lanes_hi[22]), lanes_hi[23])), XOR(lanes_hi[22], AND(NOT(lanes_hi[23]), lanes_hi[24])) + end + end + end + +end + + +if branch == "LJ" then + + + -- SHA256 implementation for "LuaJIT without FFI" branch + + function sha256_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W, sha2_K_hi + for pos = offs, offs + size - 1, 64 do + for j = 1, 16 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) + end + for j = 17, 64 do + local a, b = W[j-15], W[j-2] + W[j] = NORM( NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) ) + NORM( W[j-7] + W[j-16] ) ) + end + local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for j = 1, 64, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap) + local z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j] + W[j] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+1] + W[j+1] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+2] + W[j+2] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+3] + W[j+3] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+4] + W[j+4] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+5] + W[j+5] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+6] + W[j+6] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+7] + W[j+7] + h) ) + h, g, f, e = g, f, e, NORM(d + z) + d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z ) + end + H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]) + H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8]) + end + end + + local function ADD64_4(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi) + local sum_lo = a_lo % 2^32 + b_lo % 2^32 + c_lo % 2^32 + d_lo % 2^32 + local sum_hi = a_hi + b_hi + c_hi + d_hi + local result_lo = NORM( sum_lo ) + local result_hi = NORM( sum_hi + floor(sum_lo / 2^32) ) + return result_lo, result_hi + end + + if LuaJIT_arch == "x86" then -- Special trick is required to avoid "PHI shuffling too complex" on x86 platform + + + -- SHA512 implementation for "LuaJIT x86 without FFI" branch + + function sha512_feed_128(H_lo, H_hi, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k] + local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi + for pos = offs, offs + size - 1, 128 do + for j = 1, 16*2 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) + end + for jj = 17*2, 80*2, 2 do + local a_lo, a_hi = W[jj-30], W[jj-31] + local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25))) + local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7)) + local b_lo, b_hi = W[jj-4], W[jj-5] + local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26))) + local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6)) + W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33]) + end + local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] + local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] + local zero = 0 + for j = 1, 80 do + local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo))) + local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi))) + local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9))) + local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9))) + local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32 + local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) ) + zero = zero + zero -- this thick is needed to avoid "PHI shuffling too complex" due to PHIs overlap + h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = OR(zero, g_lo), OR(zero, g_hi), OR(zero, f_lo), OR(zero, f_hi), OR(zero, e_lo), OR(zero, e_hi) + local sum_lo = z_lo % 2^32 + d_lo % 2^32 + e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) ) + d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = OR(zero, c_lo), OR(zero, c_hi), OR(zero, b_lo), OR(zero, b_hi), OR(zero, a_lo), OR(zero, a_hi) + u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7))) + u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7))) + t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo))) + t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi))) + local sum_lo = z_lo % 2^32 + t_lo % 2^32 + u_lo % 2^32 + a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + t_hi + u_hi + floor(sum_lo / 2^32) ) + end + H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0) + H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0) + H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0) + H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0) + H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0) + H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0) + H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0) + H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0) + end + end + + else -- all platforms except x86 + + + -- SHA512 implementation for "LuaJIT non-x86 without FFI" branch + + function sha512_feed_128(H_lo, H_hi, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k] + local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi + for pos = offs, offs + size - 1, 128 do + for j = 1, 16*2 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) + end + for jj = 17*2, 80*2, 2 do + local a_lo, a_hi = W[jj-30], W[jj-31] + local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25))) + local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7)) + local b_lo, b_hi = W[jj-4], W[jj-5] + local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26))) + local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6)) + W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33]) + end + local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] + local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] + for j = 1, 80 do + local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo))) + local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi))) + local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9))) + local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9))) + local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32 + local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) ) + h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = g_lo, g_hi, f_lo, f_hi, e_lo, e_hi + local sum_lo = z_lo % 2^32 + d_lo % 2^32 + e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) ) + d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = c_lo, c_hi, b_lo, b_hi, a_lo, a_hi + u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7))) + u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7))) + t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo))) + t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi))) + local sum_lo = z_lo % 2^32 + u_lo % 2^32 + t_lo % 2^32 + a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + u_hi + t_hi + floor(sum_lo / 2^32) ) + end + H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0) + H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0) + H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0) + H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0) + H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0) + H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0) + H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0) + H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0) + end + end + + end + + + -- MD5 implementation for "LuaJIT without FFI" branch + + function md5_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W, md5_K + for pos = offs, offs + size - 1, 64 do + for j = 1, 16 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a) + end + local a, b, c, d = H[1], H[2], H[3], H[4] + for j = 1, 16, 4 do + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j ] + W[j ] + a), 7) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j+1] + a), 12) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+2] + a), 17) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+3] + a), 22) + b) + end + for j = 17, 32, 4 do + local g = 5*j-4 + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j ] + W[AND(g , 15) + 1] + a), 5) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g + 5, 15) + 1] + a), 9) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 10, 15) + 1] + a), 14) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g - 1, 15) + 1] + a), 20) + b) + end + for j = 33, 48, 4 do + local g = 3*j+2 + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j ] + W[AND(g , 15) + 1] + a), 4) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 3, 15) + 1] + a), 11) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 6, 15) + 1] + a), 16) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 7, 15) + 1] + a), 23) + b) + end + for j = 49, 64, 4 do + local g = j*7 + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j ] + W[AND(g - 7, 15) + 1] + a), 6) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g , 15) + 1] + a), 10) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15) + 1] + a), 15) + b) + a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15) + 1] + a), 21) + b) + end + H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]) + end + end + + + -- SHA-1 implementation for "LuaJIT without FFI" branch + + function sha1_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W = common_W + for pos = offs, offs + size - 1, 64 do + for j = 1, 16 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) + end + for j = 17, 80 do + W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1) + end + local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5] + for j = 1, 20, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + 0x5A827999 + e)) -- constant = floor(2^30 * sqrt(2)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e)) + end + for j = 21, 40, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0x6ED9EBA1 + e)) -- 2^30 * sqrt(3) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e)) + end + for j = 41, 60, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + 0x8F1BBCDC + e)) -- 2^30 * sqrt(5) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e)) + end + for j = 61, 80, 5 do + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0xCA62C1D6 + e)) -- 2^30 * sqrt(10) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e)) + e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e)) + end + H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5]) + end + end + +end + + +if branch == "INT64" then + + + -- implementation for Lua 5.3/5.4 + + hi_factor = 4294967296 + hi_factor_keccak = 4294967296 + lanes_index_base = 1 + + HEX64, XOR64A5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed = load[[ + local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo = ... + local string_format, string_unpack = string.format, string.unpack + + local function HEX64(x) + return string_format("%016x", x) + end + + local function XOR64A5(x) + return x ~ 0xa5a5a5a5a5a5a5a5 + end + + local function XOR_BYTE(x, y) + return x ~ y + end + + local common_W = {} + + local function sha256_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W, sha2_K_hi + local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for pos = offs + 1, offs + size, 64 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos) + for j = 17, 64 do + local a = W[j-15] + a = a<<32 | a + local b = W[j-2] + b = b<<32 | b + W[j] = (a>>7 ~ a>>18 ~ a>>35) + (b>>17 ~ b>>19 ~ b>>42) + W[j-7] + W[j-16] & (1<<32)-1 + end + local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8 + for j = 1, 64 do + e = e<<32 | e & (1<<32)-1 + local z = (e>>6 ~ e>>11 ~ e>>25) + (g ~ e & (f ~ g)) + h + K[j] + W[j] + h = g + g = f + f = e + e = z + d + d = c + c = b + b = a + a = a<<32 | a & (1<<32)-1 + a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a>>13 ~ a>>22) + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + h5 = e + h5 + h6 = f + h6 + h7 = g + h7 + h8 = h + h8 + end + H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8 + end + + local function sha512_feed_128(H, _, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + local W, K = common_W, sha2_K_lo + local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for pos = offs + 1, offs + size, 128 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack(">i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos) + for j = 17, 80 do + local a = W[j-15] + local b = W[j-2] + W[j] = (a >> 1 ~ a >> 7 ~ a >> 8 ~ a << 56 ~ a << 63) + (b >> 6 ~ b >> 19 ~ b >> 61 ~ b << 3 ~ b << 45) + W[j-7] + W[j-16] + end + local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8 + for j = 1, 80 do + local z = (e >> 14 ~ e >> 18 ~ e >> 41 ~ e << 23 ~ e << 46 ~ e << 50) + (g ~ e & (f ~ g)) + h + K[j] + W[j] + h = g + g = f + f = e + e = z + d + d = c + c = b + b = a + a = z + ((a ~ c) & d ~ a & c) + (a >> 28 ~ a >> 34 ~ a >> 39 ~ a << 25 ~ a << 30 ~ a << 36) + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + h5 = e + h5 + h6 = f + h6 + h7 = g + h7 + h8 = h + h8 + end + H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8 + end + + local function md5_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K, md5_next_shift = common_W, md5_K, md5_next_shift + local h1, h2, h3, h4 = H[1], H[2], H[3], H[4] + for pos = offs + 1, offs + size, 64 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack("> s) + b + s = md5_next_shift[s] + end + s = 32-5 + for j = 17, 32 do + local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1] + a = d + d = c + c = b + b = ((F<<32 | F & (1<<32)-1) >> s) + b + s = md5_next_shift[s] + end + s = 32-4 + for j = 33, 48 do + local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1] + a = d + d = c + c = b + b = ((F<<32 | F & (1<<32)-1) >> s) + b + s = md5_next_shift[s] + end + s = 32-6 + for j = 49, 64 do + local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1] + a = d + d = c + c = b + b = ((F<<32 | F & (1<<32)-1) >> s) + b + s = md5_next_shift[s] + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + end + H[1], H[2], H[3], H[4] = h1, h2, h3, h4 + end + + local function sha1_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W = common_W + local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5] + for pos = offs + 1, offs + size, 64 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos) + for j = 17, 80 do + local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16] + W[j] = (a<<32 | a) << 1 >> 32 + end + local a, b, c, d, e = h1, h2, h3, h4, h5 + for j = 1, 20 do + local z = ((a<<32 | a & (1<<32)-1) >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2)) + e = d + d = c + c = (b<<32 | b & (1<<32)-1) >> 2 + b = a + a = z + end + for j = 21, 40 do + local z = ((a<<32 | a & (1<<32)-1) >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3) + e = d + d = c + c = (b<<32 | b & (1<<32)-1) >> 2 + b = a + a = z + end + for j = 41, 60 do + local z = ((a<<32 | a & (1<<32)-1) >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5) + e = d + d = c + c = (b<<32 | b & (1<<32)-1) >> 2 + b = a + a = z + end + for j = 61, 80 do + local z = ((a<<32 | a & (1<<32)-1) >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10) + e = d + d = c + c = (b<<32 | b & (1<<32)-1) >> 2 + b = a + a = z + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + h5 = e + h5 + end + H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5 + end + + local keccak_format_i8 = build_keccak_format("i8") + + local function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes) + -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8 + local RC = sha3_RC_lo + local qwords_qty = block_size_in_bytes / 8 + local keccak_format = keccak_format_i8[qwords_qty] + for pos = offs + 1, offs + size, block_size_in_bytes do + local qwords_from_message = {string_unpack(keccak_format, str, pos)} + for j = 1, qwords_qty do + lanes[j] = lanes[j] ~ qwords_from_message[j] + end + local L01, L02, L03, L04, L05, L06, L07, L08, L09, L10, L11, L12, L13, L14, L15, L16, L17, L18, L19, L20, L21, L22, L23, L24, L25 = + lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7], lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], + lanes[14], lanes[15], lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23], lanes[24], lanes[25] + for round_idx = 1, 24 do + local C1 = L01 ~ L06 ~ L11 ~ L16 ~ L21 + local C2 = L02 ~ L07 ~ L12 ~ L17 ~ L22 + local C3 = L03 ~ L08 ~ L13 ~ L18 ~ L23 + local C4 = L04 ~ L09 ~ L14 ~ L19 ~ L24 + local C5 = L05 ~ L10 ~ L15 ~ L20 ~ L25 + local D = C1 ~ C3<<1 ~ C3>>63 + local T0 = D ~ L02 + local T1 = D ~ L07 + local T2 = D ~ L12 + local T3 = D ~ L17 + local T4 = D ~ L22 + L02 = T1<<44 ~ T1>>20 + L07 = T3<<45 ~ T3>>19 + L12 = T0<<1 ~ T0>>63 + L17 = T2<<10 ~ T2>>54 + L22 = T4<<2 ~ T4>>62 + D = C2 ~ C4<<1 ~ C4>>63 + T0 = D ~ L03 + T1 = D ~ L08 + T2 = D ~ L13 + T3 = D ~ L18 + T4 = D ~ L23 + L03 = T2<<43 ~ T2>>21 + L08 = T4<<61 ~ T4>>3 + L13 = T1<<6 ~ T1>>58 + L18 = T3<<15 ~ T3>>49 + L23 = T0<<62 ~ T0>>2 + D = C3 ~ C5<<1 ~ C5>>63 + T0 = D ~ L04 + T1 = D ~ L09 + T2 = D ~ L14 + T3 = D ~ L19 + T4 = D ~ L24 + L04 = T3<<21 ~ T3>>43 + L09 = T0<<28 ~ T0>>36 + L14 = T2<<25 ~ T2>>39 + L19 = T4<<56 ~ T4>>8 + L24 = T1<<55 ~ T1>>9 + D = C4 ~ C1<<1 ~ C1>>63 + T0 = D ~ L05 + T1 = D ~ L10 + T2 = D ~ L15 + T3 = D ~ L20 + T4 = D ~ L25 + L05 = T4<<14 ~ T4>>50 + L10 = T1<<20 ~ T1>>44 + L15 = T3<<8 ~ T3>>56 + L20 = T0<<27 ~ T0>>37 + L25 = T2<<39 ~ T2>>25 + D = C5 ~ C2<<1 ~ C2>>63 + T1 = D ~ L06 + T2 = D ~ L11 + T3 = D ~ L16 + T4 = D ~ L21 + L06 = T2<<3 ~ T2>>61 + L11 = T4<<18 ~ T4>>46 + L16 = T1<<36 ~ T1>>28 + L21 = T3<<41 ~ T3>>23 + L01 = D ~ L01 + L01, L02, L03, L04, L05 = L01 ~ ~L02 & L03, L02 ~ ~L03 & L04, L03 ~ ~L04 & L05, L04 ~ ~L05 & L01, L05 ~ ~L01 & L02 + L06, L07, L08, L09, L10 = L09 ~ ~L10 & L06, L10 ~ ~L06 & L07, L06 ~ ~L07 & L08, L07 ~ ~L08 & L09, L08 ~ ~L09 & L10 + L11, L12, L13, L14, L15 = L12 ~ ~L13 & L14, L13 ~ ~L14 & L15, L14 ~ ~L15 & L11, L15 ~ ~L11 & L12, L11 ~ ~L12 & L13 + L16, L17, L18, L19, L20 = L20 ~ ~L16 & L17, L16 ~ ~L17 & L18, L17 ~ ~L18 & L19, L18 ~ ~L19 & L20, L19 ~ ~L20 & L16 + L21, L22, L23, L24, L25 = L23 ~ ~L24 & L25, L24 ~ ~L25 & L21, L25 ~ ~L21 & L22, L21 ~ ~L22 & L23, L22 ~ ~L23 & L24 + L01 = L01 ~ RC[round_idx] + end + lanes[1] = L01 + lanes[2] = L02 + lanes[3] = L03 + lanes[4] = L04 + lanes[5] = L05 + lanes[6] = L06 + lanes[7] = L07 + lanes[8] = L08 + lanes[9] = L09 + lanes[10] = L10 + lanes[11] = L11 + lanes[12] = L12 + lanes[13] = L13 + lanes[14] = L14 + lanes[15] = L15 + lanes[16] = L16 + lanes[17] = L17 + lanes[18] = L18 + lanes[19] = L19 + lanes[20] = L20 + lanes[21] = L21 + lanes[22] = L22 + lanes[23] = L23 + lanes[24] = L24 + lanes[25] = L25 + end + end + + return HEX64, XOR64A5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed + ]](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo) + +end + + +if branch == "INT32" then + + + -- implementation for Lua 5.3/5.4 having non-standard numbers config "int32"+"double" (built with LUA_INT_TYPE=LUA_INT_INT) + + K_lo_modulo = 2^32 + + function HEX(x) -- returns string of 8 lowercase hexadecimal digits + return string_format("%08x", x) + end + + XOR32A5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed = load[[ + local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi = ... + local string_unpack, floor = string.unpack, math.floor + + local function XOR32A5(x) + return x ~ 0xA5A5A5A5 + end + + local function XOR_BYTE(x, y) + return x ~ y + end + + local common_W = {} + + local function sha256_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W, sha2_K_hi + local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for pos = offs + 1, offs + size, 64 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos) + for j = 17, 64 do + local a, b = W[j-15], W[j-2] + W[j] = (a>>7 ~ a<<25 ~ a<<14 ~ a>>18 ~ a>>3) + (b<<15 ~ b>>17 ~ b<<13 ~ b>>19 ~ b>>10) + W[j-7] + W[j-16] + end + local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8 + for j = 1, 64 do + local z = (e>>6 ~ e<<26 ~ e>>11 ~ e<<21 ~ e>>25 ~ e<<7) + (g ~ e & (f ~ g)) + h + K[j] + W[j] + h = g + g = f + f = e + e = z + d + d = c + c = b + b = a + a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a<<30 ~ a>>13 ~ a<<19 ~ a<<10 ~ a>>22) + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + h5 = e + h5 + h6 = f + h6 + h7 = g + h7 + h8 = h + h8 + end + H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8 + end + + local function sha512_feed_128(H_lo, H_hi, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k] + local floor, W, K_lo, K_hi = floor, common_W, sha2_K_lo, sha2_K_hi + local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] + local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] + for pos = offs + 1, offs + size, 128 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16], + W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] = + string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos) + for jj = 17*2, 80*2, 2 do + local a_lo, a_hi, b_lo, b_hi = W[jj-30], W[jj-31], W[jj-4], W[jj-5] + local tmp = + (a_lo>>1 ~ a_hi<<31 ~ a_lo>>8 ~ a_hi<<24 ~ a_lo>>7 ~ a_hi<<25) % 2^32 + + (b_lo>>19 ~ b_hi<<13 ~ b_lo<<3 ~ b_hi>>29 ~ b_lo>>6 ~ b_hi<<26) % 2^32 + + W[jj-14] % 2^32 + W[jj-32] % 2^32 + W[jj-1] = + (a_hi>>1 ~ a_lo<<31 ~ a_hi>>8 ~ a_lo<<24 ~ a_hi>>7) + + (b_hi>>19 ~ b_lo<<13 ~ b_hi<<3 ~ b_lo>>29 ~ b_hi>>6) + + W[jj-15] + W[jj-33] + floor(tmp / 2^32) + W[jj] = 0|((tmp + 2^31) % 2^32 - 2^31) + end + local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo + local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi + for j = 1, 80 do + local jj = 2*j + local z_lo = (e_lo>>14 ~ e_hi<<18 ~ e_lo>>18 ~ e_hi<<14 ~ e_lo<<23 ~ e_hi>>9) % 2^32 + (g_lo ~ e_lo & (f_lo ~ g_lo)) % 2^32 + h_lo % 2^32 + K_lo[j] + W[jj] % 2^32 + local z_hi = (e_hi>>14 ~ e_lo<<18 ~ e_hi>>18 ~ e_lo<<14 ~ e_hi<<23 ~ e_lo>>9) + (g_hi ~ e_hi & (f_hi ~ g_hi)) + h_hi + K_hi[j] + W[jj-1] + floor(z_lo / 2^32) + z_lo = z_lo % 2^32 + h_lo = g_lo + h_hi = g_hi + g_lo = f_lo + g_hi = f_hi + f_lo = e_lo + f_hi = e_hi + e_lo = z_lo + d_lo % 2^32 + e_hi = z_hi + d_hi + floor(e_lo / 2^32) + e_lo = 0|((e_lo + 2^31) % 2^32 - 2^31) + d_lo = c_lo + d_hi = c_hi + c_lo = b_lo + c_hi = b_hi + b_lo = a_lo + b_hi = a_hi + z_lo = z_lo + (d_lo & c_lo ~ b_lo & (d_lo ~ c_lo)) % 2^32 + (b_lo>>28 ~ b_hi<<4 ~ b_lo<<30 ~ b_hi>>2 ~ b_lo<<25 ~ b_hi>>7) % 2^32 + a_hi = z_hi + (d_hi & c_hi ~ b_hi & (d_hi ~ c_hi)) + (b_hi>>28 ~ b_lo<<4 ~ b_hi<<30 ~ b_lo>>2 ~ b_hi<<25 ~ b_lo>>7) + floor(z_lo / 2^32) + a_lo = 0|((z_lo + 2^31) % 2^32 - 2^31) + end + a_lo = h1_lo % 2^32 + a_lo % 2^32 + h1_hi = h1_hi + a_hi + floor(a_lo / 2^32) + h1_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h2_lo % 2^32 + b_lo % 2^32 + h2_hi = h2_hi + b_hi + floor(a_lo / 2^32) + h2_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h3_lo % 2^32 + c_lo % 2^32 + h3_hi = h3_hi + c_hi + floor(a_lo / 2^32) + h3_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h4_lo % 2^32 + d_lo % 2^32 + h4_hi = h4_hi + d_hi + floor(a_lo / 2^32) + h4_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h5_lo % 2^32 + e_lo % 2^32 + h5_hi = h5_hi + e_hi + floor(a_lo / 2^32) + h5_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h6_lo % 2^32 + f_lo % 2^32 + h6_hi = h6_hi + f_hi + floor(a_lo / 2^32) + h6_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h7_lo % 2^32 + g_lo % 2^32 + h7_hi = h7_hi + g_hi + floor(a_lo / 2^32) + h7_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + a_lo = h8_lo % 2^32 + h_lo % 2^32 + h8_hi = h8_hi + h_hi + floor(a_lo / 2^32) + h8_lo = 0|((a_lo + 2^31) % 2^32 - 2^31) + end + H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo + H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi + end + + local function md5_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K, md5_next_shift = common_W, md5_K, md5_next_shift + local h1, h2, h3, h4 = H[1], H[2], H[3], H[4] + for pos = offs + 1, offs + size, 64 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack(">s) + b + s = md5_next_shift[s] + end + s = 32-5 + for j = 17, 32 do + local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1] + a = d + d = c + c = b + b = (F << 32-s | F>>s) + b + s = md5_next_shift[s] + end + s = 32-4 + for j = 33, 48 do + local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1] + a = d + d = c + c = b + b = (F << 32-s | F>>s) + b + s = md5_next_shift[s] + end + s = 32-6 + for j = 49, 64 do + local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1] + a = d + d = c + c = b + b = (F << 32-s | F>>s) + b + s = md5_next_shift[s] + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + end + H[1], H[2], H[3], H[4] = h1, h2, h3, h4 + end + + local function sha1_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W = common_W + local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5] + for pos = offs + 1, offs + size, 64 do + W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] = + string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos) + for j = 17, 80 do + local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16] + W[j] = a << 1 ~ a >> 31 + end + local a, b, c, d, e = h1, h2, h3, h4, h5 + for j = 1, 20 do + local z = (a << 5 ~ a >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2)) + e = d + d = c + c = b << 30 ~ b >> 2 + b = a + a = z + end + for j = 21, 40 do + local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3) + e = d + d = c + c = b << 30 ~ b >> 2 + b = a + a = z + end + for j = 41, 60 do + local z = (a << 5 ~ a >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5) + e = d + d = c + c = b << 30 ~ b >> 2 + b = a + a = z + end + for j = 61, 80 do + local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10) + e = d + d = c + c = b << 30 ~ b >> 2 + b = a + a = z + end + h1 = a + h1 + h2 = b + h2 + h3 = c + h3 + h4 = d + h4 + h5 = e + h5 + end + H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5 + end + + local keccak_format_i4i4 = build_keccak_format("i4i4") + + local function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes) + -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8 + local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi + local qwords_qty = block_size_in_bytes / 8 + local keccak_format = keccak_format_i4i4[qwords_qty] + for pos = offs + 1, offs + size, block_size_in_bytes do + local dwords_from_message = {string_unpack(keccak_format, str, pos)} + for j = 1, qwords_qty do + lanes_lo[j] = lanes_lo[j] ~ dwords_from_message[2*j-1] + lanes_hi[j] = lanes_hi[j] ~ dwords_from_message[2*j] + end + local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi, + L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi, + L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi = + lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5], + lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10], + lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15], + lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20], + lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25] + for round_idx = 1, 24 do + local C1_lo = L01_lo ~ L06_lo ~ L11_lo ~ L16_lo ~ L21_lo + local C1_hi = L01_hi ~ L06_hi ~ L11_hi ~ L16_hi ~ L21_hi + local C2_lo = L02_lo ~ L07_lo ~ L12_lo ~ L17_lo ~ L22_lo + local C2_hi = L02_hi ~ L07_hi ~ L12_hi ~ L17_hi ~ L22_hi + local C3_lo = L03_lo ~ L08_lo ~ L13_lo ~ L18_lo ~ L23_lo + local C3_hi = L03_hi ~ L08_hi ~ L13_hi ~ L18_hi ~ L23_hi + local C4_lo = L04_lo ~ L09_lo ~ L14_lo ~ L19_lo ~ L24_lo + local C4_hi = L04_hi ~ L09_hi ~ L14_hi ~ L19_hi ~ L24_hi + local C5_lo = L05_lo ~ L10_lo ~ L15_lo ~ L20_lo ~ L25_lo + local C5_hi = L05_hi ~ L10_hi ~ L15_hi ~ L20_hi ~ L25_hi + local D_lo = C1_lo ~ C3_lo<<1 ~ C3_hi>>31 + local D_hi = C1_hi ~ C3_hi<<1 ~ C3_lo>>31 + local T0_lo = D_lo ~ L02_lo + local T0_hi = D_hi ~ L02_hi + local T1_lo = D_lo ~ L07_lo + local T1_hi = D_hi ~ L07_hi + local T2_lo = D_lo ~ L12_lo + local T2_hi = D_hi ~ L12_hi + local T3_lo = D_lo ~ L17_lo + local T3_hi = D_hi ~ L17_hi + local T4_lo = D_lo ~ L22_lo + local T4_hi = D_hi ~ L22_hi + L02_lo = T1_lo>>20 ~ T1_hi<<12 + L02_hi = T1_hi>>20 ~ T1_lo<<12 + L07_lo = T3_lo>>19 ~ T3_hi<<13 + L07_hi = T3_hi>>19 ~ T3_lo<<13 + L12_lo = T0_lo<<1 ~ T0_hi>>31 + L12_hi = T0_hi<<1 ~ T0_lo>>31 + L17_lo = T2_lo<<10 ~ T2_hi>>22 + L17_hi = T2_hi<<10 ~ T2_lo>>22 + L22_lo = T4_lo<<2 ~ T4_hi>>30 + L22_hi = T4_hi<<2 ~ T4_lo>>30 + D_lo = C2_lo ~ C4_lo<<1 ~ C4_hi>>31 + D_hi = C2_hi ~ C4_hi<<1 ~ C4_lo>>31 + T0_lo = D_lo ~ L03_lo + T0_hi = D_hi ~ L03_hi + T1_lo = D_lo ~ L08_lo + T1_hi = D_hi ~ L08_hi + T2_lo = D_lo ~ L13_lo + T2_hi = D_hi ~ L13_hi + T3_lo = D_lo ~ L18_lo + T3_hi = D_hi ~ L18_hi + T4_lo = D_lo ~ L23_lo + T4_hi = D_hi ~ L23_hi + L03_lo = T2_lo>>21 ~ T2_hi<<11 + L03_hi = T2_hi>>21 ~ T2_lo<<11 + L08_lo = T4_lo>>3 ~ T4_hi<<29 + L08_hi = T4_hi>>3 ~ T4_lo<<29 + L13_lo = T1_lo<<6 ~ T1_hi>>26 + L13_hi = T1_hi<<6 ~ T1_lo>>26 + L18_lo = T3_lo<<15 ~ T3_hi>>17 + L18_hi = T3_hi<<15 ~ T3_lo>>17 + L23_lo = T0_lo>>2 ~ T0_hi<<30 + L23_hi = T0_hi>>2 ~ T0_lo<<30 + D_lo = C3_lo ~ C5_lo<<1 ~ C5_hi>>31 + D_hi = C3_hi ~ C5_hi<<1 ~ C5_lo>>31 + T0_lo = D_lo ~ L04_lo + T0_hi = D_hi ~ L04_hi + T1_lo = D_lo ~ L09_lo + T1_hi = D_hi ~ L09_hi + T2_lo = D_lo ~ L14_lo + T2_hi = D_hi ~ L14_hi + T3_lo = D_lo ~ L19_lo + T3_hi = D_hi ~ L19_hi + T4_lo = D_lo ~ L24_lo + T4_hi = D_hi ~ L24_hi + L04_lo = T3_lo<<21 ~ T3_hi>>11 + L04_hi = T3_hi<<21 ~ T3_lo>>11 + L09_lo = T0_lo<<28 ~ T0_hi>>4 + L09_hi = T0_hi<<28 ~ T0_lo>>4 + L14_lo = T2_lo<<25 ~ T2_hi>>7 + L14_hi = T2_hi<<25 ~ T2_lo>>7 + L19_lo = T4_lo>>8 ~ T4_hi<<24 + L19_hi = T4_hi>>8 ~ T4_lo<<24 + L24_lo = T1_lo>>9 ~ T1_hi<<23 + L24_hi = T1_hi>>9 ~ T1_lo<<23 + D_lo = C4_lo ~ C1_lo<<1 ~ C1_hi>>31 + D_hi = C4_hi ~ C1_hi<<1 ~ C1_lo>>31 + T0_lo = D_lo ~ L05_lo + T0_hi = D_hi ~ L05_hi + T1_lo = D_lo ~ L10_lo + T1_hi = D_hi ~ L10_hi + T2_lo = D_lo ~ L15_lo + T2_hi = D_hi ~ L15_hi + T3_lo = D_lo ~ L20_lo + T3_hi = D_hi ~ L20_hi + T4_lo = D_lo ~ L25_lo + T4_hi = D_hi ~ L25_hi + L05_lo = T4_lo<<14 ~ T4_hi>>18 + L05_hi = T4_hi<<14 ~ T4_lo>>18 + L10_lo = T1_lo<<20 ~ T1_hi>>12 + L10_hi = T1_hi<<20 ~ T1_lo>>12 + L15_lo = T3_lo<<8 ~ T3_hi>>24 + L15_hi = T3_hi<<8 ~ T3_lo>>24 + L20_lo = T0_lo<<27 ~ T0_hi>>5 + L20_hi = T0_hi<<27 ~ T0_lo>>5 + L25_lo = T2_lo>>25 ~ T2_hi<<7 + L25_hi = T2_hi>>25 ~ T2_lo<<7 + D_lo = C5_lo ~ C2_lo<<1 ~ C2_hi>>31 + D_hi = C5_hi ~ C2_hi<<1 ~ C2_lo>>31 + T1_lo = D_lo ~ L06_lo + T1_hi = D_hi ~ L06_hi + T2_lo = D_lo ~ L11_lo + T2_hi = D_hi ~ L11_hi + T3_lo = D_lo ~ L16_lo + T3_hi = D_hi ~ L16_hi + T4_lo = D_lo ~ L21_lo + T4_hi = D_hi ~ L21_hi + L06_lo = T2_lo<<3 ~ T2_hi>>29 + L06_hi = T2_hi<<3 ~ T2_lo>>29 + L11_lo = T4_lo<<18 ~ T4_hi>>14 + L11_hi = T4_hi<<18 ~ T4_lo>>14 + L16_lo = T1_lo>>28 ~ T1_hi<<4 + L16_hi = T1_hi>>28 ~ T1_lo<<4 + L21_lo = T3_lo>>23 ~ T3_hi<<9 + L21_hi = T3_hi>>23 ~ T3_lo<<9 + L01_lo = D_lo ~ L01_lo + L01_hi = D_hi ~ L01_hi + L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = L01_lo ~ ~L02_lo & L03_lo, L02_lo ~ ~L03_lo & L04_lo, L03_lo ~ ~L04_lo & L05_lo, L04_lo ~ ~L05_lo & L01_lo, L05_lo ~ ~L01_lo & L02_lo + L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = L01_hi ~ ~L02_hi & L03_hi, L02_hi ~ ~L03_hi & L04_hi, L03_hi ~ ~L04_hi & L05_hi, L04_hi ~ ~L05_hi & L01_hi, L05_hi ~ ~L01_hi & L02_hi + L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = L09_lo ~ ~L10_lo & L06_lo, L10_lo ~ ~L06_lo & L07_lo, L06_lo ~ ~L07_lo & L08_lo, L07_lo ~ ~L08_lo & L09_lo, L08_lo ~ ~L09_lo & L10_lo + L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = L09_hi ~ ~L10_hi & L06_hi, L10_hi ~ ~L06_hi & L07_hi, L06_hi ~ ~L07_hi & L08_hi, L07_hi ~ ~L08_hi & L09_hi, L08_hi ~ ~L09_hi & L10_hi + L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = L12_lo ~ ~L13_lo & L14_lo, L13_lo ~ ~L14_lo & L15_lo, L14_lo ~ ~L15_lo & L11_lo, L15_lo ~ ~L11_lo & L12_lo, L11_lo ~ ~L12_lo & L13_lo + L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = L12_hi ~ ~L13_hi & L14_hi, L13_hi ~ ~L14_hi & L15_hi, L14_hi ~ ~L15_hi & L11_hi, L15_hi ~ ~L11_hi & L12_hi, L11_hi ~ ~L12_hi & L13_hi + L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = L20_lo ~ ~L16_lo & L17_lo, L16_lo ~ ~L17_lo & L18_lo, L17_lo ~ ~L18_lo & L19_lo, L18_lo ~ ~L19_lo & L20_lo, L19_lo ~ ~L20_lo & L16_lo + L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = L20_hi ~ ~L16_hi & L17_hi, L16_hi ~ ~L17_hi & L18_hi, L17_hi ~ ~L18_hi & L19_hi, L18_hi ~ ~L19_hi & L20_hi, L19_hi ~ ~L20_hi & L16_hi + L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = L23_lo ~ ~L24_lo & L25_lo, L24_lo ~ ~L25_lo & L21_lo, L25_lo ~ ~L21_lo & L22_lo, L21_lo ~ ~L22_lo & L23_lo, L22_lo ~ ~L23_lo & L24_lo + L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = L23_hi ~ ~L24_hi & L25_hi, L24_hi ~ ~L25_hi & L21_hi, L25_hi ~ ~L21_hi & L22_hi, L21_hi ~ ~L22_hi & L23_hi, L22_hi ~ ~L23_hi & L24_hi + L01_lo = L01_lo ~ RC_lo[round_idx] + L01_hi = L01_hi ~ RC_hi[round_idx] + end + lanes_lo[1] = L01_lo + lanes_hi[1] = L01_hi + lanes_lo[2] = L02_lo + lanes_hi[2] = L02_hi + lanes_lo[3] = L03_lo + lanes_hi[3] = L03_hi + lanes_lo[4] = L04_lo + lanes_hi[4] = L04_hi + lanes_lo[5] = L05_lo + lanes_hi[5] = L05_hi + lanes_lo[6] = L06_lo + lanes_hi[6] = L06_hi + lanes_lo[7] = L07_lo + lanes_hi[7] = L07_hi + lanes_lo[8] = L08_lo + lanes_hi[8] = L08_hi + lanes_lo[9] = L09_lo + lanes_hi[9] = L09_hi + lanes_lo[10] = L10_lo + lanes_hi[10] = L10_hi + lanes_lo[11] = L11_lo + lanes_hi[11] = L11_hi + lanes_lo[12] = L12_lo + lanes_hi[12] = L12_hi + lanes_lo[13] = L13_lo + lanes_hi[13] = L13_hi + lanes_lo[14] = L14_lo + lanes_hi[14] = L14_hi + lanes_lo[15] = L15_lo + lanes_hi[15] = L15_hi + lanes_lo[16] = L16_lo + lanes_hi[16] = L16_hi + lanes_lo[17] = L17_lo + lanes_hi[17] = L17_hi + lanes_lo[18] = L18_lo + lanes_hi[18] = L18_hi + lanes_lo[19] = L19_lo + lanes_hi[19] = L19_hi + lanes_lo[20] = L20_lo + lanes_hi[20] = L20_hi + lanes_lo[21] = L21_lo + lanes_hi[21] = L21_hi + lanes_lo[22] = L22_lo + lanes_hi[22] = L22_hi + lanes_lo[23] = L23_lo + lanes_hi[23] = L23_hi + lanes_lo[24] = L24_lo + lanes_hi[24] = L24_hi + lanes_lo[25] = L25_lo + lanes_hi[25] = L25_hi + end + end + + return XOR32A5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed + ]](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi) + +end + + +if branch == "LIB32" or branch == "EMUL" then + + + -- implementation for Lua 5.1/5.2 (with or without bitwise library available) + + function sha256_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K = common_W, sha2_K_hi + local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] + for pos = offs, offs + size - 1, 64 do + for j = 1, 16 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = ((a * 256 + b) * 256 + c) * 256 + d + end + for j = 17, 64 do + local a, b = W[j-15], W[j-2] + W[j] = XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16] + end + local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8 + for j = 1, 64 do + local z = XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + AND(e, f) + AND(-1-e, g) + h + K[j] + W[j] + h = g + g = f + f = e + e = z + d + d = c + c = b + b = a + a = z + AND(d, c) + AND(a, XOR(d, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + end + h1, h2, h3, h4 = (a + h1) % 4294967296, (b + h2) % 4294967296, (c + h3) % 4294967296, (d + h4) % 4294967296 + h5, h6, h7, h8 = (e + h5) % 4294967296, (f + h6) % 4294967296, (g + h7) % 4294967296, (h + h8) % 4294967296 + end + H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8 + end + + function sha512_feed_128(H_lo, H_hi, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 128 + -- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k] + local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi + local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] + local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] + for pos = offs, offs + size - 1, 128 do + for j = 1, 16*2 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = ((a * 256 + b) * 256 + c) * 256 + d + end + for jj = 17*2, 80*2, 2 do + local a_lo, a_hi, b_lo, b_hi = W[jj-30], W[jj-31], W[jj-4], W[jj-5] + local tmp1 = XOR(SHR(a_lo, 1) + SHL(a_hi, 31), SHR(a_lo, 8) + SHL(a_hi, 24), SHR(a_lo, 7) + SHL(a_hi, 25)) % 4294967296 + XOR(SHR(b_lo, 19) + SHL(b_hi, 13), SHL(b_lo, 3) + SHR(b_hi, 29), SHR(b_lo, 6) + SHL(b_hi, 26)) % 4294967296 + W[jj-14] + W[jj-32] + local tmp2 = tmp1 % 4294967296 + W[jj-1] = XOR(SHR(a_hi, 1) + SHL(a_lo, 31), SHR(a_hi, 8) + SHL(a_lo, 24), SHR(a_hi, 7)) + XOR(SHR(b_hi, 19) + SHL(b_lo, 13), SHL(b_hi, 3) + SHR(b_lo, 29), SHR(b_hi, 6)) + W[jj-15] + W[jj-33] + (tmp1 - tmp2) / 4294967296 + W[jj] = tmp2 + end + local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo + local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi + for j = 1, 80 do + local jj = 2*j + local tmp1 = XOR(SHR(e_lo, 14) + SHL(e_hi, 18), SHR(e_lo, 18) + SHL(e_hi, 14), SHL(e_lo, 23) + SHR(e_hi, 9)) % 4294967296 + (AND(e_lo, f_lo) + AND(-1-e_lo, g_lo)) % 4294967296 + h_lo + K_lo[j] + W[jj] + local z_lo = tmp1 % 4294967296 + local z_hi = XOR(SHR(e_hi, 14) + SHL(e_lo, 18), SHR(e_hi, 18) + SHL(e_lo, 14), SHL(e_hi, 23) + SHR(e_lo, 9)) + AND(e_hi, f_hi) + AND(-1-e_hi, g_hi) + h_hi + K_hi[j] + W[jj-1] + (tmp1 - z_lo) / 4294967296 + h_lo = g_lo + h_hi = g_hi + g_lo = f_lo + g_hi = f_hi + f_lo = e_lo + f_hi = e_hi + tmp1 = z_lo + d_lo + e_lo = tmp1 % 4294967296 + e_hi = z_hi + d_hi + (tmp1 - e_lo) / 4294967296 + d_lo = c_lo + d_hi = c_hi + c_lo = b_lo + c_hi = b_hi + b_lo = a_lo + b_hi = a_hi + tmp1 = z_lo + (AND(d_lo, c_lo) + AND(b_lo, XOR(d_lo, c_lo))) % 4294967296 + XOR(SHR(b_lo, 28) + SHL(b_hi, 4), SHL(b_lo, 30) + SHR(b_hi, 2), SHL(b_lo, 25) + SHR(b_hi, 7)) % 4294967296 + a_lo = tmp1 % 4294967296 + a_hi = z_hi + (AND(d_hi, c_hi) + AND(b_hi, XOR(d_hi, c_hi))) + XOR(SHR(b_hi, 28) + SHL(b_lo, 4), SHL(b_hi, 30) + SHR(b_lo, 2), SHL(b_hi, 25) + SHR(b_lo, 7)) + (tmp1 - a_lo) / 4294967296 + end + a_lo = h1_lo + a_lo + h1_lo = a_lo % 4294967296 + h1_hi = (h1_hi + a_hi + (a_lo - h1_lo) / 4294967296) % 4294967296 + a_lo = h2_lo + b_lo + h2_lo = a_lo % 4294967296 + h2_hi = (h2_hi + b_hi + (a_lo - h2_lo) / 4294967296) % 4294967296 + a_lo = h3_lo + c_lo + h3_lo = a_lo % 4294967296 + h3_hi = (h3_hi + c_hi + (a_lo - h3_lo) / 4294967296) % 4294967296 + a_lo = h4_lo + d_lo + h4_lo = a_lo % 4294967296 + h4_hi = (h4_hi + d_hi + (a_lo - h4_lo) / 4294967296) % 4294967296 + a_lo = h5_lo + e_lo + h5_lo = a_lo % 4294967296 + h5_hi = (h5_hi + e_hi + (a_lo - h5_lo) / 4294967296) % 4294967296 + a_lo = h6_lo + f_lo + h6_lo = a_lo % 4294967296 + h6_hi = (h6_hi + f_hi + (a_lo - h6_lo) / 4294967296) % 4294967296 + a_lo = h7_lo + g_lo + h7_lo = a_lo % 4294967296 + h7_hi = (h7_hi + g_hi + (a_lo - h7_lo) / 4294967296) % 4294967296 + a_lo = h8_lo + h_lo + h8_lo = a_lo % 4294967296 + h8_hi = (h8_hi + h_hi + (a_lo - h8_lo) / 4294967296) % 4294967296 + end + H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo + H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi + end + + function md5_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W, K, md5_next_shift = common_W, md5_K, md5_next_shift + local h1, h2, h3, h4 = H[1], H[2], H[3], H[4] + for pos = offs, offs + size - 1, 64 do + for j = 1, 16 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = ((d * 256 + c) * 256 + b) * 256 + a + end + local a, b, c, d = h1, h2, h3, h4 + local s = 32-7 + for j = 1, 16 do + local F = ROR(AND(b, c) + AND(-1-b, d) + a + K[j] + W[j], s) + b + s = md5_next_shift[s] + a = d + d = c + c = b + b = F + end + s = 32-5 + for j = 17, 32 do + local F = ROR(AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1], s) + b + s = md5_next_shift[s] + a = d + d = c + c = b + b = F + end + s = 32-4 + for j = 33, 48 do + local F = ROR(XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1], s) + b + s = md5_next_shift[s] + a = d + d = c + c = b + b = F + end + s = 32-6 + for j = 49, 64 do + local F = ROR(XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1], s) + b + s = md5_next_shift[s] + a = d + d = c + c = b + b = F + end + h1 = (a + h1) % 4294967296 + h2 = (b + h2) % 4294967296 + h3 = (c + h3) % 4294967296 + h4 = (d + h4) % 4294967296 + end + H[1], H[2], H[3], H[4] = h1, h2, h3, h4 + end + + function sha1_feed_64(H, str, offs, size) + -- offs >= 0, size >= 0, size is multiple of 64 + local W = common_W + local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5] + for pos = offs, offs + size - 1, 64 do + for j = 1, 16 do + pos = pos + 4 + local a, b, c, d = byte(str, pos - 3, pos) + W[j] = ((a * 256 + b) * 256 + c) * 256 + d + end + for j = 17, 80 do + W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1) + end + local a, b, c, d, e = h1, h2, h3, h4, h5 + for j = 1, 20 do + local z = ROL(a, 5) + AND(b, c) + AND(-1-b, d) + 0x5A827999 + W[j] + e -- constant = floor(2^30 * sqrt(2)) + e = d + d = c + c = ROR(b, 2) + b = a + a = z + end + for j = 21, 40 do + local z = ROL(a, 5) + XOR(b, c, d) + 0x6ED9EBA1 + W[j] + e -- 2^30 * sqrt(3) + e = d + d = c + c = ROR(b, 2) + b = a + a = z + end + for j = 41, 60 do + local z = ROL(a, 5) + AND(d, c) + AND(b, XOR(d, c)) + 0x8F1BBCDC + W[j] + e -- 2^30 * sqrt(5) + e = d + d = c + c = ROR(b, 2) + b = a + a = z + end + for j = 61, 80 do + local z = ROL(a, 5) + XOR(b, c, d) + 0xCA62C1D6 + W[j] + e -- 2^30 * sqrt(10) + e = d + d = c + c = ROR(b, 2) + b = a + a = z + end + h1 = (a + h1) % 4294967296 + h2 = (b + h2) % 4294967296 + h3 = (c + h3) % 4294967296 + h4 = (d + h4) % 4294967296 + h5 = (e + h5) % 4294967296 + end + H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5 + end + + function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes) + -- This is an example of a Lua function having 79 local variables :-) + -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8 + local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi + local qwords_qty = block_size_in_bytes / 8 + for pos = offs, offs + size - 1, block_size_in_bytes do + for j = 1, qwords_qty do + local a, b, c, d = byte(str, pos + 1, pos + 4) + lanes_lo[j] = XOR(lanes_lo[j], ((d * 256 + c) * 256 + b) * 256 + a) + pos = pos + 8 + a, b, c, d = byte(str, pos - 3, pos) + lanes_hi[j] = XOR(lanes_hi[j], ((d * 256 + c) * 256 + b) * 256 + a) + end + local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi, + L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi, + L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi = + lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5], + lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10], + lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15], + lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20], + lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25] + for round_idx = 1, 24 do + local C1_lo = XOR(L01_lo, L06_lo, L11_lo, L16_lo, L21_lo) + local C1_hi = XOR(L01_hi, L06_hi, L11_hi, L16_hi, L21_hi) + local C2_lo = XOR(L02_lo, L07_lo, L12_lo, L17_lo, L22_lo) + local C2_hi = XOR(L02_hi, L07_hi, L12_hi, L17_hi, L22_hi) + local C3_lo = XOR(L03_lo, L08_lo, L13_lo, L18_lo, L23_lo) + local C3_hi = XOR(L03_hi, L08_hi, L13_hi, L18_hi, L23_hi) + local C4_lo = XOR(L04_lo, L09_lo, L14_lo, L19_lo, L24_lo) + local C4_hi = XOR(L04_hi, L09_hi, L14_hi, L19_hi, L24_hi) + local C5_lo = XOR(L05_lo, L10_lo, L15_lo, L20_lo, L25_lo) + local C5_hi = XOR(L05_hi, L10_hi, L15_hi, L20_hi, L25_hi) + local D_lo = XOR(C1_lo, C3_lo * 2 + (C3_hi % 2^32 - C3_hi % 2^31) / 2^31) + local D_hi = XOR(C1_hi, C3_hi * 2 + (C3_lo % 2^32 - C3_lo % 2^31) / 2^31) + local T0_lo = XOR(D_lo, L02_lo) + local T0_hi = XOR(D_hi, L02_hi) + local T1_lo = XOR(D_lo, L07_lo) + local T1_hi = XOR(D_hi, L07_hi) + local T2_lo = XOR(D_lo, L12_lo) + local T2_hi = XOR(D_hi, L12_hi) + local T3_lo = XOR(D_lo, L17_lo) + local T3_hi = XOR(D_hi, L17_hi) + local T4_lo = XOR(D_lo, L22_lo) + local T4_hi = XOR(D_hi, L22_hi) + L02_lo = (T1_lo % 2^32 - T1_lo % 2^20) / 2^20 + T1_hi * 2^12 + L02_hi = (T1_hi % 2^32 - T1_hi % 2^20) / 2^20 + T1_lo * 2^12 + L07_lo = (T3_lo % 2^32 - T3_lo % 2^19) / 2^19 + T3_hi * 2^13 + L07_hi = (T3_hi % 2^32 - T3_hi % 2^19) / 2^19 + T3_lo * 2^13 + L12_lo = T0_lo * 2 + (T0_hi % 2^32 - T0_hi % 2^31) / 2^31 + L12_hi = T0_hi * 2 + (T0_lo % 2^32 - T0_lo % 2^31) / 2^31 + L17_lo = T2_lo * 2^10 + (T2_hi % 2^32 - T2_hi % 2^22) / 2^22 + L17_hi = T2_hi * 2^10 + (T2_lo % 2^32 - T2_lo % 2^22) / 2^22 + L22_lo = T4_lo * 2^2 + (T4_hi % 2^32 - T4_hi % 2^30) / 2^30 + L22_hi = T4_hi * 2^2 + (T4_lo % 2^32 - T4_lo % 2^30) / 2^30 + D_lo = XOR(C2_lo, C4_lo * 2 + (C4_hi % 2^32 - C4_hi % 2^31) / 2^31) + D_hi = XOR(C2_hi, C4_hi * 2 + (C4_lo % 2^32 - C4_lo % 2^31) / 2^31) + T0_lo = XOR(D_lo, L03_lo) + T0_hi = XOR(D_hi, L03_hi) + T1_lo = XOR(D_lo, L08_lo) + T1_hi = XOR(D_hi, L08_hi) + T2_lo = XOR(D_lo, L13_lo) + T2_hi = XOR(D_hi, L13_hi) + T3_lo = XOR(D_lo, L18_lo) + T3_hi = XOR(D_hi, L18_hi) + T4_lo = XOR(D_lo, L23_lo) + T4_hi = XOR(D_hi, L23_hi) + L03_lo = (T2_lo % 2^32 - T2_lo % 2^21) / 2^21 + T2_hi * 2^11 + L03_hi = (T2_hi % 2^32 - T2_hi % 2^21) / 2^21 + T2_lo * 2^11 + L08_lo = (T4_lo % 2^32 - T4_lo % 2^3) / 2^3 + T4_hi * 2^29 % 2^32 + L08_hi = (T4_hi % 2^32 - T4_hi % 2^3) / 2^3 + T4_lo * 2^29 % 2^32 + L13_lo = T1_lo * 2^6 + (T1_hi % 2^32 - T1_hi % 2^26) / 2^26 + L13_hi = T1_hi * 2^6 + (T1_lo % 2^32 - T1_lo % 2^26) / 2^26 + L18_lo = T3_lo * 2^15 + (T3_hi % 2^32 - T3_hi % 2^17) / 2^17 + L18_hi = T3_hi * 2^15 + (T3_lo % 2^32 - T3_lo % 2^17) / 2^17 + L23_lo = (T0_lo % 2^32 - T0_lo % 2^2) / 2^2 + T0_hi * 2^30 % 2^32 + L23_hi = (T0_hi % 2^32 - T0_hi % 2^2) / 2^2 + T0_lo * 2^30 % 2^32 + D_lo = XOR(C3_lo, C5_lo * 2 + (C5_hi % 2^32 - C5_hi % 2^31) / 2^31) + D_hi = XOR(C3_hi, C5_hi * 2 + (C5_lo % 2^32 - C5_lo % 2^31) / 2^31) + T0_lo = XOR(D_lo, L04_lo) + T0_hi = XOR(D_hi, L04_hi) + T1_lo = XOR(D_lo, L09_lo) + T1_hi = XOR(D_hi, L09_hi) + T2_lo = XOR(D_lo, L14_lo) + T2_hi = XOR(D_hi, L14_hi) + T3_lo = XOR(D_lo, L19_lo) + T3_hi = XOR(D_hi, L19_hi) + T4_lo = XOR(D_lo, L24_lo) + T4_hi = XOR(D_hi, L24_hi) + L04_lo = T3_lo * 2^21 % 2^32 + (T3_hi % 2^32 - T3_hi % 2^11) / 2^11 + L04_hi = T3_hi * 2^21 % 2^32 + (T3_lo % 2^32 - T3_lo % 2^11) / 2^11 + L09_lo = T0_lo * 2^28 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^4) / 2^4 + L09_hi = T0_hi * 2^28 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^4) / 2^4 + L14_lo = T2_lo * 2^25 % 2^32 + (T2_hi % 2^32 - T2_hi % 2^7) / 2^7 + L14_hi = T2_hi * 2^25 % 2^32 + (T2_lo % 2^32 - T2_lo % 2^7) / 2^7 + L19_lo = (T4_lo % 2^32 - T4_lo % 2^8) / 2^8 + T4_hi * 2^24 % 2^32 + L19_hi = (T4_hi % 2^32 - T4_hi % 2^8) / 2^8 + T4_lo * 2^24 % 2^32 + L24_lo = (T1_lo % 2^32 - T1_lo % 2^9) / 2^9 + T1_hi * 2^23 % 2^32 + L24_hi = (T1_hi % 2^32 - T1_hi % 2^9) / 2^9 + T1_lo * 2^23 % 2^32 + D_lo = XOR(C4_lo, C1_lo * 2 + (C1_hi % 2^32 - C1_hi % 2^31) / 2^31) + D_hi = XOR(C4_hi, C1_hi * 2 + (C1_lo % 2^32 - C1_lo % 2^31) / 2^31) + T0_lo = XOR(D_lo, L05_lo) + T0_hi = XOR(D_hi, L05_hi) + T1_lo = XOR(D_lo, L10_lo) + T1_hi = XOR(D_hi, L10_hi) + T2_lo = XOR(D_lo, L15_lo) + T2_hi = XOR(D_hi, L15_hi) + T3_lo = XOR(D_lo, L20_lo) + T3_hi = XOR(D_hi, L20_hi) + T4_lo = XOR(D_lo, L25_lo) + T4_hi = XOR(D_hi, L25_hi) + L05_lo = T4_lo * 2^14 + (T4_hi % 2^32 - T4_hi % 2^18) / 2^18 + L05_hi = T4_hi * 2^14 + (T4_lo % 2^32 - T4_lo % 2^18) / 2^18 + L10_lo = T1_lo * 2^20 % 2^32 + (T1_hi % 2^32 - T1_hi % 2^12) / 2^12 + L10_hi = T1_hi * 2^20 % 2^32 + (T1_lo % 2^32 - T1_lo % 2^12) / 2^12 + L15_lo = T3_lo * 2^8 + (T3_hi % 2^32 - T3_hi % 2^24) / 2^24 + L15_hi = T3_hi * 2^8 + (T3_lo % 2^32 - T3_lo % 2^24) / 2^24 + L20_lo = T0_lo * 2^27 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^5) / 2^5 + L20_hi = T0_hi * 2^27 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^5) / 2^5 + L25_lo = (T2_lo % 2^32 - T2_lo % 2^25) / 2^25 + T2_hi * 2^7 + L25_hi = (T2_hi % 2^32 - T2_hi % 2^25) / 2^25 + T2_lo * 2^7 + D_lo = XOR(C5_lo, C2_lo * 2 + (C2_hi % 2^32 - C2_hi % 2^31) / 2^31) + D_hi = XOR(C5_hi, C2_hi * 2 + (C2_lo % 2^32 - C2_lo % 2^31) / 2^31) + T1_lo = XOR(D_lo, L06_lo) + T1_hi = XOR(D_hi, L06_hi) + T2_lo = XOR(D_lo, L11_lo) + T2_hi = XOR(D_hi, L11_hi) + T3_lo = XOR(D_lo, L16_lo) + T3_hi = XOR(D_hi, L16_hi) + T4_lo = XOR(D_lo, L21_lo) + T4_hi = XOR(D_hi, L21_hi) + L06_lo = T2_lo * 2^3 + (T2_hi % 2^32 - T2_hi % 2^29) / 2^29 + L06_hi = T2_hi * 2^3 + (T2_lo % 2^32 - T2_lo % 2^29) / 2^29 + L11_lo = T4_lo * 2^18 + (T4_hi % 2^32 - T4_hi % 2^14) / 2^14 + L11_hi = T4_hi * 2^18 + (T4_lo % 2^32 - T4_lo % 2^14) / 2^14 + L16_lo = (T1_lo % 2^32 - T1_lo % 2^28) / 2^28 + T1_hi * 2^4 + L16_hi = (T1_hi % 2^32 - T1_hi % 2^28) / 2^28 + T1_lo * 2^4 + L21_lo = (T3_lo % 2^32 - T3_lo % 2^23) / 2^23 + T3_hi * 2^9 + L21_hi = (T3_hi % 2^32 - T3_hi % 2^23) / 2^23 + T3_lo * 2^9 + L01_lo = XOR(D_lo, L01_lo) + L01_hi = XOR(D_hi, L01_hi) + L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = XOR(L01_lo, AND(-1-L02_lo, L03_lo)), XOR(L02_lo, AND(-1-L03_lo, L04_lo)), XOR(L03_lo, AND(-1-L04_lo, L05_lo)), XOR(L04_lo, AND(-1-L05_lo, L01_lo)), XOR(L05_lo, AND(-1-L01_lo, L02_lo)) + L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = XOR(L01_hi, AND(-1-L02_hi, L03_hi)), XOR(L02_hi, AND(-1-L03_hi, L04_hi)), XOR(L03_hi, AND(-1-L04_hi, L05_hi)), XOR(L04_hi, AND(-1-L05_hi, L01_hi)), XOR(L05_hi, AND(-1-L01_hi, L02_hi)) + L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = XOR(L09_lo, AND(-1-L10_lo, L06_lo)), XOR(L10_lo, AND(-1-L06_lo, L07_lo)), XOR(L06_lo, AND(-1-L07_lo, L08_lo)), XOR(L07_lo, AND(-1-L08_lo, L09_lo)), XOR(L08_lo, AND(-1-L09_lo, L10_lo)) + L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = XOR(L09_hi, AND(-1-L10_hi, L06_hi)), XOR(L10_hi, AND(-1-L06_hi, L07_hi)), XOR(L06_hi, AND(-1-L07_hi, L08_hi)), XOR(L07_hi, AND(-1-L08_hi, L09_hi)), XOR(L08_hi, AND(-1-L09_hi, L10_hi)) + L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = XOR(L12_lo, AND(-1-L13_lo, L14_lo)), XOR(L13_lo, AND(-1-L14_lo, L15_lo)), XOR(L14_lo, AND(-1-L15_lo, L11_lo)), XOR(L15_lo, AND(-1-L11_lo, L12_lo)), XOR(L11_lo, AND(-1-L12_lo, L13_lo)) + L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = XOR(L12_hi, AND(-1-L13_hi, L14_hi)), XOR(L13_hi, AND(-1-L14_hi, L15_hi)), XOR(L14_hi, AND(-1-L15_hi, L11_hi)), XOR(L15_hi, AND(-1-L11_hi, L12_hi)), XOR(L11_hi, AND(-1-L12_hi, L13_hi)) + L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = XOR(L20_lo, AND(-1-L16_lo, L17_lo)), XOR(L16_lo, AND(-1-L17_lo, L18_lo)), XOR(L17_lo, AND(-1-L18_lo, L19_lo)), XOR(L18_lo, AND(-1-L19_lo, L20_lo)), XOR(L19_lo, AND(-1-L20_lo, L16_lo)) + L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = XOR(L20_hi, AND(-1-L16_hi, L17_hi)), XOR(L16_hi, AND(-1-L17_hi, L18_hi)), XOR(L17_hi, AND(-1-L18_hi, L19_hi)), XOR(L18_hi, AND(-1-L19_hi, L20_hi)), XOR(L19_hi, AND(-1-L20_hi, L16_hi)) + L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = XOR(L23_lo, AND(-1-L24_lo, L25_lo)), XOR(L24_lo, AND(-1-L25_lo, L21_lo)), XOR(L25_lo, AND(-1-L21_lo, L22_lo)), XOR(L21_lo, AND(-1-L22_lo, L23_lo)), XOR(L22_lo, AND(-1-L23_lo, L24_lo)) + L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = XOR(L23_hi, AND(-1-L24_hi, L25_hi)), XOR(L24_hi, AND(-1-L25_hi, L21_hi)), XOR(L25_hi, AND(-1-L21_hi, L22_hi)), XOR(L21_hi, AND(-1-L22_hi, L23_hi)), XOR(L22_hi, AND(-1-L23_hi, L24_hi)) + L01_lo = XOR(L01_lo, RC_lo[round_idx]) + L01_hi = L01_hi + RC_hi[round_idx] -- RC_hi[] is either 0 or 0x80000000, so we could use fast addition instead of slow XOR + end + lanes_lo[1] = L01_lo + lanes_hi[1] = L01_hi + lanes_lo[2] = L02_lo + lanes_hi[2] = L02_hi + lanes_lo[3] = L03_lo + lanes_hi[3] = L03_hi + lanes_lo[4] = L04_lo + lanes_hi[4] = L04_hi + lanes_lo[5] = L05_lo + lanes_hi[5] = L05_hi + lanes_lo[6] = L06_lo + lanes_hi[6] = L06_hi + lanes_lo[7] = L07_lo + lanes_hi[7] = L07_hi + lanes_lo[8] = L08_lo + lanes_hi[8] = L08_hi + lanes_lo[9] = L09_lo + lanes_hi[9] = L09_hi + lanes_lo[10] = L10_lo + lanes_hi[10] = L10_hi + lanes_lo[11] = L11_lo + lanes_hi[11] = L11_hi + lanes_lo[12] = L12_lo + lanes_hi[12] = L12_hi + lanes_lo[13] = L13_lo + lanes_hi[13] = L13_hi + lanes_lo[14] = L14_lo + lanes_hi[14] = L14_hi + lanes_lo[15] = L15_lo + lanes_hi[15] = L15_hi + lanes_lo[16] = L16_lo + lanes_hi[16] = L16_hi + lanes_lo[17] = L17_lo + lanes_hi[17] = L17_hi + lanes_lo[18] = L18_lo + lanes_hi[18] = L18_hi + lanes_lo[19] = L19_lo + lanes_hi[19] = L19_hi + lanes_lo[20] = L20_lo + lanes_hi[20] = L20_hi + lanes_lo[21] = L21_lo + lanes_hi[21] = L21_hi + lanes_lo[22] = L22_lo + lanes_hi[22] = L22_hi + lanes_lo[23] = L23_lo + lanes_hi[23] = L23_hi + lanes_lo[24] = L24_lo + lanes_hi[24] = L24_hi + lanes_lo[25] = L25_lo + lanes_hi[25] = L25_hi + end + end + +end + + +-------------------------------------------------------------------------------- +-- MAGIC NUMBERS CALCULATOR +-------------------------------------------------------------------------------- +-- Q: +-- Is 53-bit "double" math enough to calculate square roots and cube roots of primes with 64 correct bits after decimal point? +-- A: +-- Yes, 53-bit "double" arithmetic is enough. +-- We could obtain first 40 bits by direct calculation of p^(1/3) and next 40 bits by one step of Newton's method. + +do + local function mul(src1, src2, factor, result_length) + -- src1, src2 - long integers (arrays of digits in base 2^24) + -- factor - small integer + -- returns long integer result (src1 * src2 * factor) and its floating point approximation + local result, carry, value, weight = {}, 0.0, 0.0, 1.0 + for j = 1, result_length do + for k = math_max(1, j + 1 - #src2), math_min(j, #src1) do + carry = carry + factor * src1[k] * src2[j + 1 - k] -- "int32" is not enough for multiplication result, that's why "factor" must be of type "double" + end + local digit = carry % 2^24 + result[j] = floor(digit) + carry = (carry - digit) / 2^24 + value = value + digit * weight + weight = weight * 2^24 + end + return result, value + end + + local idx, step, p, one, sqrt_hi, sqrt_lo = 0, {4, 1, 2, -2, 2}, 4, {1}, sha2_H_hi, sha2_H_lo + repeat + p = p + step[p % 6] + local d = 1 + repeat + d = d + step[d % 6] + if d*d > p then -- next prime number is found + local root = p^(1/3) + local R = root * 2^40 + R = mul({R - R % 1}, one, 1.0, 2) + local _, delta = mul(R, mul(R, R, 1.0, 4), -1.0, 4) + local hi = R[2] % 65536 * 65536 + floor(R[1] / 256) + local lo = R[1] % 256 * 16777216 + floor(delta * (2^-56 / 3) * root / p) + if idx < 16 then + root = p^(1/2) + R = root * 2^40 + R = mul({R - R % 1}, one, 1.0, 2) + _, delta = mul(R, R, -1.0, 2) + local hi = R[2] % 65536 * 65536 + floor(R[1] / 256) + local lo = R[1] % 256 * 16777216 + floor(delta * 2^-17 / root) + local idx = idx % 8 + 1 + sha2_H_ext256[224][idx] = lo + sqrt_hi[idx], sqrt_lo[idx] = hi, lo + hi * hi_factor + if idx > 7 then + sqrt_hi, sqrt_lo = sha2_H_ext512_hi[384], sha2_H_ext512_lo[384] + end + end + idx = idx + 1 + sha2_K_hi[idx], sha2_K_lo[idx] = hi, lo % K_lo_modulo + hi * hi_factor + break + end + until p % d == 0 + until idx > 79 +end + +-- Calculating IVs for SHA512/224 and SHA512/256 +for width = 224, 256, 32 do + local H_lo, H_hi = {} + if XOR64A5 then + for j = 1, 8 do + H_lo[j] = XOR64A5(sha2_H_lo[j]) + end + else + H_hi = {} + for j = 1, 8 do + H_lo[j] = XOR32A5(sha2_H_lo[j]) + H_hi[j] = XOR32A5(sha2_H_hi[j]) + end + end + sha512_feed_128(H_lo, H_hi, "SHA-512/"..tostring(width).."\128"..string_rep("\0", 115).."\88", 0, 128) + sha2_H_ext512_lo[width] = H_lo + sha2_H_ext512_hi[width] = H_hi +end + +-- Constants for MD5 +do + local sin, abs, modf = math.sin, math.abs, math.modf + for idx = 1, 64 do + -- we can't use formula floor(abs(sin(idx))*2^32) because its result may be beyond integer range on Lua built with 32-bit integers + local hi, lo = modf(abs(sin(idx)) * 2^16) + md5_K[idx] = hi * 65536 + floor(lo * 2^16) + end +end + +-- Constants for SHA3 +do + local sh_reg = 29 + local function next_bit() + local r = sh_reg % 2 + sh_reg = XOR_BYTE((sh_reg - r) / 2, 142 * r) + return r + end + for idx = 1, 24 do + local lo, m = 0 + for _ = 1, 6 do + m = m and m * m * 2 or 1 + lo = lo + next_bit() * m + end + local hi = next_bit() * m + sha3_RC_hi[idx], sha3_RC_lo[idx] = hi, lo + hi * hi_factor_keccak + end +end + + +-------------------------------------------------------------------------------- +-- MAIN FUNCTIONS +-------------------------------------------------------------------------------- + +local function sha256ext(width, message) + + -- Create an instance (private objects for current calculation) + local H, length, tail = {unpack(sha2_H_ext256[width])}, 0.0, "" + + local function partial(message_part) + if message_part then + if tail then + length = length + #message_part + local offs = 0 + if tail ~= "" and #tail + #message_part >= 64 then + offs = 64 - #tail + sha256_feed_64(H, tail..sub(message_part, 1, offs), 0, 64) + tail = "" + end + local size = #message_part - offs + local size_tail = size % 64 + sha256_feed_64(H, message_part, offs, size - size_tail) + tail = tail..sub(message_part, #message_part + 1 - size_tail) + return partial + else + error("Adding more chunks is not allowed after receiving the result", 2) + end + else + if tail then + local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)} + tail = nil + -- Assuming user data length is shorter than (2^53)-9 bytes + -- Anyway, it looks very unrealistic that someone would spend more than a year of calculations to process 2^53 bytes of data by using this Lua script :-) + -- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes + length = length * (8 / 256^7) -- convert "byte-counter" to "bit-counter" and move decimal point to the left + for j = 4, 10 do + length = length % 1 * 256 + final_blocks[j] = char(floor(length)) + end + final_blocks = table_concat(final_blocks) + sha256_feed_64(H, final_blocks, 0, #final_blocks) + local max_reg = width / 32 + for j = 1, max_reg do + H[j] = HEX(H[j]) + end + H = table_concat(H, "", 1, max_reg) + end + return H + end + end + + if message then + -- Actually perform calculations and return the SHA256 digest of a message + return partial(message)() + else + -- Return function for chunk-by-chunk loading + -- User should feed every chunk of input data as single argument to this function and finally get SHA256 digest by invoking this function without an argument + return partial + end + +end + + +local function sha512ext(width, message) + + -- Create an instance (private objects for current calculation) + local length, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_ext512_lo[width])}, not HEX64 and {unpack(sha2_H_ext512_hi[width])} + + local function partial(message_part) + if message_part then + if tail then + length = length + #message_part + local offs = 0 + if tail ~= "" and #tail + #message_part >= 128 then + offs = 128 - #tail + sha512_feed_128(H_lo, H_hi, tail..sub(message_part, 1, offs), 0, 128) + tail = "" + end + local size = #message_part - offs + local size_tail = size % 128 + sha512_feed_128(H_lo, H_hi, message_part, offs, size - size_tail) + tail = tail..sub(message_part, #message_part + 1 - size_tail) + return partial + else + error("Adding more chunks is not allowed after receiving the result", 2) + end + else + if tail then + local final_blocks = {tail, "\128", string_rep("\0", (-17-length) % 128 + 9)} + tail = nil + -- Assuming user data length is shorter than (2^53)-17 bytes + -- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes + length = length * (8 / 256^7) -- convert "byte-counter" to "bit-counter" and move floating point to the left + for j = 4, 10 do + length = length % 1 * 256 + final_blocks[j] = char(floor(length)) + end + final_blocks = table_concat(final_blocks) + sha512_feed_128(H_lo, H_hi, final_blocks, 0, #final_blocks) + local max_reg = ceil(width / 64) + if HEX64 then + for j = 1, max_reg do + H_lo[j] = HEX64(H_lo[j]) + end + else + for j = 1, max_reg do + H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j]) + end + H_hi = nil + end + H_lo = sub(table_concat(H_lo, "", 1, max_reg), 1, width / 4) + end + return H_lo + end + end + + if message then + -- Actually perform calculations and return the SHA512 digest of a message + return partial(message)() + else + -- Return function for chunk-by-chunk loading + -- User should feed every chunk of input data as single argument to this function and finally get SHA512 digest by invoking this function without an argument + return partial + end + +end + + +local function md5(message) + + -- Create an instance (private objects for current calculation) + local H, length, tail = {unpack(md5_sha1_H, 1, 4)}, 0.0, "" + + local function partial(message_part) + if message_part then + if tail then + length = length + #message_part + local offs = 0 + if tail ~= "" and #tail + #message_part >= 64 then + offs = 64 - #tail + md5_feed_64(H, tail..sub(message_part, 1, offs), 0, 64) + tail = "" + end + local size = #message_part - offs + local size_tail = size % 64 + md5_feed_64(H, message_part, offs, size - size_tail) + tail = tail..sub(message_part, #message_part + 1 - size_tail) + return partial + else + error("Adding more chunks is not allowed after receiving the result", 2) + end + else + if tail then + local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64)} + tail = nil + length = length * 8 -- convert "byte-counter" to "bit-counter" + for j = 4, 11 do + local low_byte = length % 256 + final_blocks[j] = char(low_byte) + length = (length - low_byte) / 256 + end + final_blocks = table_concat(final_blocks) + md5_feed_64(H, final_blocks, 0, #final_blocks) + for j = 1, 4 do + H[j] = HEX(H[j]) + end + H = gsub(table_concat(H), "(..)(..)(..)(..)", "%4%3%2%1") + end + return H + end + end + + if message then + -- Actually perform calculations and return the MD5 digest of a message + return partial(message)() + else + -- Return function for chunk-by-chunk loading + -- User should feed every chunk of input data as single argument to this function and finally get MD5 digest by invoking this function without an argument + return partial + end + +end + + +local function sha1(message) + + -- Create an instance (private objects for current calculation) + local H, length, tail = {unpack(md5_sha1_H)}, 0.0, "" + + local function partial(message_part) + if message_part then + if tail then + length = length + #message_part + local offs = 0 + if tail ~= "" and #tail + #message_part >= 64 then + offs = 64 - #tail + sha1_feed_64(H, tail..sub(message_part, 1, offs), 0, 64) + tail = "" + end + local size = #message_part - offs + local size_tail = size % 64 + sha1_feed_64(H, message_part, offs, size - size_tail) + tail = tail..sub(message_part, #message_part + 1 - size_tail) + return partial + else + error("Adding more chunks is not allowed after receiving the result", 2) + end + else + if tail then + local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)} + tail = nil + -- Assuming user data length is shorter than (2^53)-9 bytes + -- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes + length = length * (8 / 256^7) -- convert "byte-counter" to "bit-counter" and move decimal point to the left + for j = 4, 10 do + length = length % 1 * 256 + final_blocks[j] = char(floor(length)) + end + final_blocks = table_concat(final_blocks) + sha1_feed_64(H, final_blocks, 0, #final_blocks) + for j = 1, 5 do + H[j] = HEX(H[j]) + end + H = table_concat(H) + end + return H + end + end + + if message then + -- Actually perform calculations and return the SHA-1 digest of a message + return partial(message)() + else + -- Return function for chunk-by-chunk loading + -- User should feed every chunk of input data as single argument to this function and finally get SHA-1 digest by invoking this function without an argument + return partial + end + +end + + +local function keccak(block_size_in_bytes, digest_size_in_bytes, is_SHAKE, message) + -- "block_size_in_bytes" is multiple of 8 + if type(digest_size_in_bytes) ~= "number" then + -- arguments in SHAKE are swapped: + -- NIST FIPS 202 defines SHAKE(message,num_bits) + -- this module defines SHAKE(num_bytes,message) + -- it's easy to forget about this swap, hence the check + error("Argument 'digest_size_in_bytes' must be a number", 2) + end + + -- Create an instance (private objects for current calculation) + local tail, lanes_lo, lanes_hi = "", create_array_of_lanes(), hi_factor_keccak == 0 and create_array_of_lanes() + local result + +--~ pad the input N using the pad function, yielding a padded bit string P with a length divisible by r (such that n = len(P)/r is integer), +--~ break P into n consecutive r-bit pieces P0, ..., Pn-1 (last is zero-padded) +--~ initialize the state S to a string of b 0 bits. +--~ absorb the input into the state: For each block Pi, +--~ extend Pi at the end by a string of c 0 bits, yielding one of length b, +--~ XOR that with S and +--~ apply the block permutation f to the result, yielding a new state S +--~ initialize Z to be the empty string +--~ while the length of Z is less than d: +--~ append the first r bits of S to Z +--~ if Z is still less than d bits long, apply f to S, yielding a new state S. +--~ truncate Z to d bits + + local function partial(message_part) + if message_part then + if tail then + local offs = 0 + if tail ~= "" and #tail + #message_part >= block_size_in_bytes then + offs = block_size_in_bytes - #tail + keccak_feed(lanes_lo, lanes_hi, tail..sub(message_part, 1, offs), 0, block_size_in_bytes, block_size_in_bytes) + tail = "" + end + local size = #message_part - offs + local size_tail = size % block_size_in_bytes + keccak_feed(lanes_lo, lanes_hi, message_part, offs, size - size_tail, block_size_in_bytes) + tail = tail..sub(message_part, #message_part + 1 - size_tail) + return partial + else + error("Adding more chunks is not allowed after receiving the result", 2) + end + else + if tail then + -- append the following bits to the message: for usual SHA3: 011(0*)1, for SHAKE: 11111(0*)1 + local gap_start = is_SHAKE and 31 or 6 + tail = tail..(#tail + 1 == block_size_in_bytes and char(gap_start + 128) or char(gap_start)..string_rep("\0", (-2 - #tail) % block_size_in_bytes).."\128") + keccak_feed(lanes_lo, lanes_hi, tail, 0, #tail, block_size_in_bytes) + tail = nil + + local lanes_used = 0 + local total_lanes = floor(block_size_in_bytes / 8) + local qwords = {} + + local function get_next_qwords_of_digest(qwords_qty) + -- returns not more than 'qwords_qty' qwords ('qwords_qty' might be non-integer) + -- doesn't go across keccak-buffer boundary + -- block_size_in_bytes is a multiple of 8, so, keccak-buffer contains integer number of qwords + if lanes_used >= total_lanes then + keccak_feed(lanes_lo, lanes_hi, "\0\0\0\0\0\0\0\0", 0, 8, 8) + lanes_used = 0 + end + qwords_qty = floor(math_min(qwords_qty, total_lanes - lanes_used)) + if hi_factor_keccak ~= 0 then + for j = 1, qwords_qty do + qwords[j] = HEX64(lanes_lo[lanes_used + j - 1 + lanes_index_base]) + end + else + for j = 1, qwords_qty do + qwords[j] = HEX(lanes_hi[lanes_used + j])..HEX(lanes_lo[lanes_used + j]) + end + end + lanes_used = lanes_used + qwords_qty + return + gsub(table_concat(qwords, "", 1, qwords_qty), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), + qwords_qty * 8 + end + + local parts = {} -- digest parts + local last_part, last_part_size = "", 0 + + local function get_next_part_of_digest(bytes_needed) + -- returns 'bytes_needed' bytes, for arbitrary integer 'bytes_needed' + bytes_needed = bytes_needed or 1 + if bytes_needed <= last_part_size then + last_part_size = last_part_size - bytes_needed + local part_size_in_nibbles = bytes_needed * 2 + local result = sub(last_part, 1, part_size_in_nibbles) + last_part = sub(last_part, part_size_in_nibbles + 1) + return result + end + local parts_qty = 0 + if last_part_size > 0 then + parts_qty = 1 + parts[parts_qty] = last_part + bytes_needed = bytes_needed - last_part_size + end + -- repeats until the length is enough + while bytes_needed >= 8 do + local next_part, next_part_size = get_next_qwords_of_digest(bytes_needed / 8) + parts_qty = parts_qty + 1 + parts[parts_qty] = next_part + bytes_needed = bytes_needed - next_part_size + end + if bytes_needed > 0 then + last_part, last_part_size = get_next_qwords_of_digest(1) + parts_qty = parts_qty + 1 + parts[parts_qty] = get_next_part_of_digest(bytes_needed) + else + last_part, last_part_size = "", 0 + end + return table_concat(parts, "", 1, parts_qty) + end + + if digest_size_in_bytes < 0 then + result = get_next_part_of_digest + else + result = get_next_part_of_digest(digest_size_in_bytes) + end + + end + return result + end + end + + if message then + -- Actually perform calculations and return the SHA3 digest of a message + return partial(message)() + else + -- Return function for chunk-by-chunk loading + -- User should feed every chunk of input data as single argument to this function and finally get SHA3 digest by invoking this function without an argument + return partial + end + +end + + +local hex2bin, bin2base64, base642bin +do + + function hex2bin(hex_string) + return (gsub(hex_string, "%x%x", + function (hh) + return char(tonumber(hh, 16)) + end + )) + end + + local base64_symbols = { + ['+'] = 62, ['-'] = 62, [62] = '+', + ['/'] = 63, ['_'] = 63, [63] = '/', + ['='] = -1, ['.'] = -1, [-1] = '=' + } + local symbol_index = 0 + for j, pair in ipairs{'AZ', 'az', '09'} do + for ascii = byte(pair), byte(pair, 2) do + local ch = char(ascii) + base64_symbols[ch] = symbol_index + base64_symbols[symbol_index] = ch + symbol_index = symbol_index + 1 + end + end + + function bin2base64(binary_string) + local result = {} + for pos = 1, #binary_string, 3 do + local c1, c2, c3, c4 = byte(sub(binary_string, pos, pos + 2)..'\0', 1, -1) + result[#result + 1] = + base64_symbols[floor(c1 / 4)] + ..base64_symbols[c1 % 4 * 16 + floor(c2 / 16)] + ..base64_symbols[c3 and c2 % 16 * 4 + floor(c3 / 64) or -1] + ..base64_symbols[c4 and c3 % 64 or -1] + end + return table_concat(result) + end + + function base642bin(base64_string) + local result, chars_qty = {}, 3 + for pos, ch in gmatch(gsub(base64_string, '%s+', ''), '()(.)') do + local code = base64_symbols[ch] + if code < 0 then + chars_qty = chars_qty - 1 + code = 0 + end + local idx = pos % 4 + if idx > 0 then + result[-idx] = code + else + local c1 = result[-1] * 4 + floor(result[-2] / 16) + local c2 = (result[-2] % 16) * 16 + floor(result[-3] / 4) + local c3 = (result[-3] % 4) * 64 + code + result[#result + 1] = sub(char(c1, c2, c3), 1, chars_qty) + end + end + return table_concat(result) + end + +end + + +local block_size_for_HMAC -- this table will be initialized at the end of the module + +local function pad_and_xor(str, result_length, byte_for_xor) + return gsub(str, ".", + function(c) + return char(XOR_BYTE(byte(c), byte_for_xor)) + end + )..string_rep(char(byte_for_xor), result_length - #str) +end + +local function hmac(hash_func, key, message) + + -- Create an instance (private objects for current calculation) + local block_size = block_size_for_HMAC[hash_func] + if not block_size then + error("Unknown hash function", 2) + end + if #key > block_size then + key = hex2bin(hash_func(key)) + end + local append = hash_func()(pad_and_xor(key, block_size, 0x36)) + local result + + local function partial(message_part) + if not message_part then + result = result or hash_func(pad_and_xor(key, block_size, 0x5C)..hex2bin(append())) + return result + elseif result then + error("Adding more chunks is not allowed after receiving the result", 2) + else + append(message_part) + return partial + end + end + + if message then + -- Actually perform calculations and return the HMAC of a message + return partial(message)() + else + -- Return function for chunk-by-chunk loading of a message + -- User should feed every chunk of the message as single argument to this function and finally get HMAC by invoking this function without an argument + return partial + end + +end + + +local sha = { + md5 = md5, -- MD5 + sha1 = sha1, -- SHA-1 + -- SHA2 hash functions: + sha224 = function (message) return sha256ext(224, message) end, -- SHA-224 + sha256 = function (message) return sha256ext(256, message) end, -- SHA-256 + sha512_224 = function (message) return sha512ext(224, message) end, -- SHA-512/224 + sha512_256 = function (message) return sha512ext(256, message) end, -- SHA-512/256 + sha384 = function (message) return sha512ext(384, message) end, -- SHA-384 + sha512 = function (message) return sha512ext(512, message) end, -- SHA-512 + -- SHA3 hash functions: + sha3_224 = function (message) return keccak((1600 - 2 * 224) / 8, 224 / 8, false, message) end, -- SHA3-224 + sha3_256 = function (message) return keccak((1600 - 2 * 256) / 8, 256 / 8, false, message) end, -- SHA3-256 + sha3_384 = function (message) return keccak((1600 - 2 * 384) / 8, 384 / 8, false, message) end, -- SHA3-384 + sha3_512 = function (message) return keccak((1600 - 2 * 512) / 8, 512 / 8, false, message) end, -- SHA3-512 + shake128 = function (digest_size_in_bytes, message) return keccak((1600 - 2 * 128) / 8, digest_size_in_bytes, true, message) end, -- SHAKE128 + shake256 = function (digest_size_in_bytes, message) return keccak((1600 - 2 * 256) / 8, digest_size_in_bytes, true, message) end, -- SHAKE256 + -- misc utilities: + hmac = hmac, -- HMAC(hash_func, key, message) is applicable to any hash function from this module except SHAKE* + hex2bin = hex2bin, -- converts hexadecimal representation to binary string + base642bin = base642bin, -- converts base64 representation to binary string + bin2base64 = bin2base64, -- converts binary string to base64 representation +} + + +block_size_for_HMAC = { + [sha.md5] = 64, + [sha.sha1] = 64, + [sha.sha224] = 64, + [sha.sha256] = 64, + [sha.sha512_224] = 128, + [sha.sha512_256] = 128, + [sha.sha384] = 128, + [sha.sha512] = 128, + [sha.sha3_224] = (1600 - 2 * 224) / 8, + [sha.sha3_256] = (1600 - 2 * 256) / 8, + [sha.sha3_384] = (1600 - 2 * 384) / 8, + [sha.sha3_512] = (1600 - 2 * 512) / 8, +} + + +return sha diff --git a/src/libs/utils.lua b/src/libs/utils.lua index 3e79b89..2c4f3af 100644 --- a/src/libs/utils.lua +++ b/src/libs/utils.lua @@ -1,18 +1,27 @@ local _M = {} +local sha = require("sha") local secret_bucket_duration = 43200 -- 60 * 60 * 12 -- 12 hours -function _M.generate_secret(context, salt, is_applet) + +function _M.generate_secret(context, salt, is_applet, iterations) local start_sec = core.now()['sec'] local bucket = start_sec - (start_sec % secret_bucket_duration) - local ip = context.sf:src() - local user_agent - if is_applet == true then - user_agent = context.headers['user-agent'] or {} - user_agent = user_agent[0] + local ip = context.sf:src() + local user_agent = "" +--TODO: fix bug here making this not be same value +-- if is_applet == true then +-- user_agent = context.headers['user-agent'] or {} +-- user_agent = user_agent[0] +-- else +-- user_agent = context.sf:req_hdr('user-agent') +-- end + if iterations == nil then + --hcaptcha secret is just this + return context.sc:xxh32(salt .. bucket .. ip .. user_agent) else - user_agent = context.sf:req_hdr('user-agent') + --POW secret adds the iteration number by the user + return sha.sha1(salt .. bucket .. ip .. user_agent .. iterations) end - return context.sc:xxh64(salt .. bucket .. ip .. user_agent) end return _M diff --git a/src/scripts/hcaptcha.lua b/src/scripts/hcaptcha.lua index d3b7c51..8bdac47 100644 --- a/src/scripts/hcaptcha.lua +++ b/src/scripts/hcaptcha.lua @@ -5,10 +5,12 @@ local http = require("http") local utils = require("utils") local cookie = require("cookie") local json = require("json") +local sha = require("sha") local captcha_secret = os.getenv("HCAPTCHA_SECRET") local captcha_sitekey = os.getenv("HCAPTCHA_SITEKEY") -local cookie_secret = os.getenv("COOKIE_SECRET") +local hcaptcha_cookie_secret = os.getenv("CAPTCHA_COOKIE_SECRET") +local pow_cookie_secret = os.getenv("POW_COOKIE_SECRET") local captcha_provider_domain = "hcaptcha.com" @@ -21,6 +23,7 @@ local body_template = [[ - +

Captcha completion required

We have detected unusual activity on the requested resource.

Please solve this captcha to prove you are not a robot.

@@ -41,9 +44,10 @@ local body_template = [[
- +
+ ]] @@ -52,7 +56,8 @@ function _M.view(applet) local response_body local response_status_code if applet.method == "GET" then - response_body = string.format(body_template, captcha_sitekey) + generated_work = utils.generate_secret(applet, pow_cookie_secret, true, "") + response_body = string.format(body_template, generated_work, captcha_sitekey) response_status_code = 403 applet:set_status(response_status_code) applet:add_header("content-type", "text/html") @@ -75,7 +80,7 @@ function _M.view(applet) api_response = {} end if api_response.success == true then - local floating_hash = utils.generate_secret(applet, cookie_secret, true) + local floating_hash = utils.generate_secret(applet, hcaptcha_cookie_secret, true, nil) applet:add_header( "set-cookie", string.format("z_ddos_captcha=%s; expires=Thu, 31-Dec-37 23:55:55 GMT; Path=/", floating_hash) @@ -97,13 +102,24 @@ end function _M.check_captcha_status(txn) local parsed_request_cookies = cookie.get_cookie_table(txn.sf:hdr("Cookie")) - local expected_cookie = utils.generate_secret(txn, cookie_secret) - --core.Debug("RECEIVED SECRET COOKIE: " .. parsed_request_cookies["z_ddos_captcha"]) - --core.Debug("EXPECTED SECRET COOKIE: " .. expected_cookie) + local expected_cookie = utils.generate_secret(txn, hcaptcha_cookie_secret, false, nil) if parsed_request_cookies["z_ddos_captcha"] == expected_cookie then - core.Debug("CAPTCHA STATUS CHECK SUCCESS") + --core.Debug("CAPTCHA STATUS CHECK SUCCESS") return txn:set_var("txn.captcha_passed", true) end end +function _M.check_pow_status(txn) + local parsed_request_cookies = cookie.get_cookie_table(txn.sf:hdr("Cookie")) + if parsed_request_cookies["z_ddos_pow"] then + local generated_work = utils.generate_secret(txn, pow_cookie_secret, false, "") + local iterations = parsed_request_cookies["z_ddos_pow"] + local completed_work = sha.sha1(generated_work .. iterations) + local challenge_offset = tonumber(generated_work:sub(1,1),16) * 2 + if completed_work:sub(challenge_offset+1, challenge_offset+4) == 'b00b' then -- i dont know lua properly :^) + return txn:set_var("txn.pow_passed", true) + end + end +end + return _M diff --git a/src/scripts/register.lua b/src/scripts/register.lua index d32c00c..4d491aa 100644 --- a/src/scripts/register.lua +++ b/src/scripts/register.lua @@ -3,4 +3,5 @@ package.path = package.path .. "./?.lua;/usr/local/etc/haproxy/scripts/?.lua;/u local hcaptcha = require("hcaptcha") core.register_service("hcaptcha-view", "http", hcaptcha.view) -core.register_action("hcaptcha-redirect", { 'http-req', }, hcaptcha.check_captcha_status) +core.register_action("hcaptcha-check", { 'http-req', }, hcaptcha.check_captcha_status) +core.register_action("pow-check", { 'http-req', }, hcaptcha.check_pow_status)