From 17961785e37f53c3f8a74b3e16431b80fe463aa8 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Tue, 16 May 2023 16:03:01 +0900 Subject: [PATCH 1/1] Import seahash 4.1.0 --- .cargo_vcs_info.json | 5 + .gitignore | 2 + .gitlab-ci.yml | 24 +++ Cargo.toml | 35 +++++ Cargo.toml.orig | 22 +++ README.md | 11 ++ benches/bench.rs | 85 +++++++++++ logo.png | Bin 0 -> 16443 bytes src/buffer.rs | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/helper.rs | 141 ++++++++++++++++++ src/impl_std.rs | 32 ++++ src/lib.rs | 168 +++++++++++++++++++++ src/reference.rs | 152 +++++++++++++++++++ src/stream.rs | 284 +++++++++++++++++++++++++++++++++++ tests/chunking.rs | 67 +++++++++ tests/quickchecks.rs | 47 ++++++ 16 files changed, 1486 insertions(+) create mode 100644 .cargo_vcs_info.json create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 100644 README.md create mode 100644 benches/bench.rs create mode 100644 logo.png create mode 100644 src/buffer.rs create mode 100644 src/helper.rs create mode 100644 src/impl_std.rs create mode 100644 src/lib.rs create mode 100644 src/reference.rs create mode 100644 src/stream.rs create mode 100644 tests/chunking.rs create mode 100644 tests/quickchecks.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..29e63e6 --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,5 @@ +{ + "git": { + "sha1": "94b632aeac099031c373599313d5b5f0acbbaec0" + } +} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ca98cd9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target/ +Cargo.lock diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a18ea28 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,24 @@ +# This file is a template, and might need editing before it works on your project. +# Official language image. Look for the different tagged releases at: +# https://hub.docker.com/r/library/rust/tags/ +image: "rust:latest" + +# Optional: Pick zero or more services to be used on all builds. +# Only needed when using a docker container to run your tests in. +# Check out: http://docs.gitlab.com/ce/ci/docker/using_docker_images.html#what-is-a-service +# services: +# - mysql:latest +# - redis:latest +# - postgres:latest + +# Optional: Install a C compiler, cmake and git into the container. +# You will often need this when you (or any of your dependencies) depends on C code. +# before_script: +# - apt-get update -yqq +# - apt-get install -yqq --no-install-recommends build-essential + +# Use cargo to test the project +test:cargo: + script: + - rustc --version && cargo --version # Print version info for debugging + - cargo test --all --verbose --all-features diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..bbbf59c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,35 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "seahash" +version = "4.1.0" +authors = ["ticki ", "Tom Almeida "] +exclude = ["target", "Cargo.lock"] +description = "A blazingly fast, portable hash function with proven statistical guarantees." +documentation = "https://docs.rs/seahash" +keywords = ["hash", "hashing", "checksum", "checksumming", "portable"] +license = "MIT" +repository = "https://gitlab.redox-os.org/redox-os/seahash" + +[[bench]] +name = "bench" +harness = false +[dev-dependencies.criterion] +version = "0.3" + +[dev-dependencies.quickcheck] +version = "0.9.2" + +[features] +default = [] +use_std = [] diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..70505b2 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,22 @@ +[package] +name = "seahash" +version = "4.1.0" +authors = ["ticki ", "Tom Almeida "] +description = "A blazingly fast, portable hash function with proven statistical guarantees." +repository = "https://gitlab.redox-os.org/redox-os/seahash" +documentation = "https://docs.rs/seahash" +license = "MIT" +keywords = ["hash", "hashing", "checksum", "checksumming", "portable"] +exclude = ["target", "Cargo.lock"] + +[dev-dependencies] +quickcheck = "0.9.2" +criterion = "0.3" + +[features] +default = [] +use_std = [] + +[[bench]] +name = "bench" +harness = false diff --git a/README.md b/README.md new file mode 100644 index 0000000..db664b1 --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +
Logo
+=================== + +SeaHash: A bizarrely fast hash function. + +SeaHash is a hash function with performance better than (around 3-20% improvement) xxHash and +MetroHash. Furthermore, SeaHash has mathematically provable statistical guarantees. + +In action: + +[![The hash function in action.](http://ticki.github.io/img/seahash_construction_diagram.svg)](http://ticki.github.io/img/seahash_construction_diagram.svg) diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 0000000..ae22068 --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,85 @@ +extern crate core; +extern crate criterion; +extern crate seahash; + +use core::hash::Hasher; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; + +fn describe_benches(c: &mut Criterion) { + // shared buffers for all tests + let buf = vec![15; 16 * 1024]; + + // shared/n and buffer/n are executed for these sizes + let sizes = [64, 1024, 4096, 16 * 1024]; + + let mut group = c.benchmark_group("buffer"); + + for size in &sizes { + group.throughput(Throughput::Bytes(*size as u64)); + + group.bench_with_input(BenchmarkId::from_parameter(size), size, |b, &size| { + b.iter(|| { + black_box(seahash::hash(&buf[..size])); + }) + }); + } + + group.finish(); + + let mut group = c.benchmark_group("stream"); + + for size in &sizes { + group.throughput(Throughput::Bytes(*size as u64)); + + group.bench_with_input(BenchmarkId::from_parameter(size), size, |b, &size| { + b.iter_with_setup( + || seahash::SeaHasher::default(), + |mut h: seahash::SeaHasher| { + // use chunks of 32 bytes to simulate some looping on a single hasher value + for _ in 0..size / 32 { + h.write(&buf[..32]); + } + // this will mostly be an empty slice, but that is a possible Hasher api usage + h.write(&buf[..(size % 32)]); + black_box(h.finish()) + }, + ) + }); + } + + group.finish(); + + // gigabyte group times are comparable with earlier benchmark values based on + // d52d115a223a0e81d1600bd8a5e73cb4b24a38c0 + let mut group = c.benchmark_group("gigabyte"); + group.throughput(Throughput::Bytes((1024 * 1024 * 1024) as u64)); + + group.bench_function(BenchmarkId::from_parameter("buffer"), |b| { + b.iter(|| { + let mut buf = [15; 4096]; + let mut total = 0; + for _ in 0..250_000 { + total ^= seahash::hash(&buf); + buf[0] = buf[0].wrapping_add(1); + } + black_box(total) + }) + }); + + group.bench_function(BenchmarkId::from_parameter("stream"), |b| { + b.iter(|| { + let mut buf = [15; 4096]; + let mut h = seahash::SeaHasher::default(); + for _ in 0..250_000 { + h.write(&buf); + buf[0] = buf[0].wrapping_add(1); + } + black_box(h.finish()) + }) + }); + + group.finish(); +} + +criterion_group!(benches, describe_benches); +criterion_main!(benches); diff --git a/logo.png b/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..6656ffa60bf4c2e34e73564fd1c06a8a19601120 GIT binary patch literal 16443 zcmXYY2RPf`_kRYl_uiyb5woaOt5HQ!BN3y-s#fh)Y9(rG`Jh(qR*j&I(HJ#i?@_JN zQeuxS^qz?yN{`)IxElVet(D>Z3_6Gpy*#5nMfLD3k0Dv&S0Ip*h{CzwBNj5v; zPy76J>~j|imJIZ0Gqib?f2w6Q?=$Y_lv3@30EWk-k8SQRz51Fb=cXrL^x8@IO8m0o z@X24_opVm>2hF(V=CaDOma)%FQTHbi4Wq4xO-BX7xWpwDlNy#(_afeVi2U~P_9@Ud z&}?#m?(fHPCRDOT_QiVmkZ&HS{opb7>jLFG!#e#JfM;jq5h#-~lkrkhDeM6dR1fF{ z6oRi{)zGhT$wyxU`MJQ!OE2F6n2Vkq!x*Xn2XuvVI_&kF zy)d&N)z$LNyWSyvzVaj$lYLrD^1@p!z%&Kwddsmb*PdG$xEj-dvBmCM9)?HqCNDV= zee)Ju&a(igaaotWM4N?O=~usM+o&rcM_}1<_k1mh|C4kIEuzEA;BVqJ@fzba{ty^P z1kj#JhAI}f$sg(We{#+#v}HXuBB{_)Mf$g8!XbOqKIyo&HPZ=SJ9?F5Hd8=7Q{BY~ zW*eh~c_&5yF=r$YCy9qRwI7A=mKwQz_{AHTr&*`@rXh4^Qd|X8$rh8T7cT|+dN4~W z?cpa45F~^^`G*Jvx(^-%wNoVn<}4>eE)M=z;aFI>mK}c`ufC@aW4Tw8og&5(H6JTH z+Tll6|3>Z;jRn>Mdxi+iw8fG~OcrQ~qek(la8C;v(Oa%(UEGDq5@m@G(e}X$ri_t7 z49_Xq*8#8?G4WS%E3Dk{G4jgim)|wsZ&(edkali%9BT9#Ta9HOg2n_vWmk%G`tjayPHeL(Rk_U4`S{Pbr=qY>ID?5wgzo7O0O9nG`nnT4|DEdH8aGRh8 z-v4V&81NnR6TN^{M=OCQW*M_-p9AcH_PqqaRmGzu#j1$BsA^2TsgG$W;61F_c;SC- z4ZLDrXNME9sAiI4I>iXYo}vI^kDS~!tY-)ULco=94R}pr{4O}sY;tNg?SJa(Vn*@B zDHT`-s(c%~X)rx;nXW@|{xat60kgEP{OT2!IB^UcBJibc#{X2XDjp-&MYnqo?fILj z$u@foPm37?b=75-h<~wE;h0g2A5;2nMpO7j%ttB2x>rTjad|ZB%t5H}-_4vU0v{{m8YCct%1T@|9@TLQgPu^DGsuyvA_=)Hj zQsAYD{IIJ^awJ)xn~~$%!q78NBqcI2xc??`q2S&0Kv8uA(5o}e7aKFqNEu`u03wk$ zv|+=Q21Y1pOuLZXq)9j)lZ)j-1bW2~9!tSyNb^?&6?j#25QRRK0SYc_t(Zzo{;{Xt zJ^Xr6I3f_21niUeVKOUt4EAA1R>;n?w(Vmhfm686zPV0eXzR0U-$gp7eqyW;fsega z#jpHnrT1Z6m7778{~95ZV#9dJnpqa0=EHzY@OD5K?#-Xb(FB7V`6~J33|hNx)PDdx zcBIUB&3VH{=y=?MLtV^kMxbUZS3gz#Y<-0&hEwYipcW_#mA?YBmYF{)h`$XDc@|NGS;|I;k%3?w4Y8(8fuu4r6>=7r z{84sXcI>k;KoHA-M37FR>qh2%pa#kx2bjBE`v5 z=N#2#N+m;{N3tR5*>2p%vehiA=I?9#2p}OzPZ57RY;VXrfS%~FgIlrrHRgTEpJ$f_ z(ojEMv?g!`DsrgMHCA?1Oox-ltCCNkyp)!0aW5##jq#*9|DmVtK32vd2W{E%|HXq( z#hL79`oH@0z}9KunH$SZC}j9(MS=7Hk^gBg1u<8WY%$Jgdj$%{FAH%2q}F_!c~BEY z&oFdYq1KqsI*2tu3ObP2uSPBNvcvaP`ab5mb6Z~-cuH21g7}zC#iTgn>;aKyVJz&J zqmir87@>9eRhs$|NT5PK-}~_c?vjl&*?ZuhsPWD+E}@4 z%pU8x+$VWwKiQq2F-0&v?_j0MhiH`H_FH0-dZ~y%S=y!qr#o|pBqok0AS$3F_$Kqx zxHZuLweFjKtc+b+o;MR~;U1+>9@!7L5>?bVrl(ye2Mp$!mg$^s!o~-fPfE;rm;n2- z+|f5k=8G$$Ika|3F-)WVhpzv&^|_k6!*I=RlST|t5q9^XC!%#IMti>qM<>^gb+{7N z%7CwlKUUdfPivLyYgf@lYi2$;!Q8<&Ra6I5SDy~4N~9>A0kQ!3WO-#c zLAIkKcZDx<)!4V4Mv+Z>8yuD)ZbhNT`=Ts73o$l%plP%cXIup1Nfu88FzG0K%9bD+ z!k{AhT-CS+mTC6e(_2Bjr{)3Eu)gpP?(-qBFzx(7CKAHfnRo3JZFJQW`!%U4jsp`k zEm*ZwGO3zHB?G$1UaM#$`av`nt8FI2fMNVP^Z`(t6X(7Jg;8qcVAwt8SH6wB+*LG> z>0J!y1E`c~Kt&R#(HtQ17Uj*ayGao9fu3aW6o541W9UznksH^6Qnz1Lvf9z1l~^9V z!PT{V%OfuaFOdT^__O>A@UqiA<2>)+s7@5pM%f#e6>b5ef+cMCshLb9TPI0h3Pxq{ z>}2d&26hDP0(TITg)M&bEJ8I3=2gE5xJ##&piNuan?`Bi0ce5P_DO zS_N+(M(rkKp889?MhN=nl}^*oM5;?Li2(LA@a;IV`u?YS8(>!O&7Ams&dx+IJ>>(o z&zTm8;lieW=C-XBh4cc91P9wh>6SWYNp6@k+uPh#&I?fo(ho-Bt^c8Sm z_v0Pr&KasqbZ|}5wV44A73Fef6YQVHr7-oI$S4CY5@Yt4yWS9mO$9B3!YI#KuW}iG z0@|jYfAY^$VTsl@tpY9V$TNkjKfhjKB|q}r{Pf}Q6Jf4ayvM8Llj|Ppd$5Ip3&6=#IJN?J1s^HA>|S<*hxNV~^IM-UlBi$MzDI`C8c|~7NoPCi zItT}eFCSbzbI}T`Z(E-v?|z{8`sA}e=VtWeblVP=c(FWc^@;m%x@}K@c=M$9k6I3i z4y`;c*>{8lsX5KiCfBWVe*rf#PK&G-JLAi-YqTNZ)@bvz;7GA^woxsK6C zrs?|N%3=r!hWOi^Tkpi3oS`_1kSmqVln5-jkyFPl;seAR4d3bm>Q`=lj%n+?>&X_) zb>@bG>&VEDvySoX>eW>UhXnW+n@n^K5)eDOZ~OV z&U4@&AS3aQFWZJm6=&}cqH%#?qaTf`Nd(j;Io-7ehqGu|;U}TcKEuwX0IQ14{GOKa zss|gPeEUNEJ)6Pqi-yk9B)_WJJ;RW^H6`>7#Ldq!m}e)zyD#=y3$odYm)O!D*K9@k zHr+$Km;7`106|YrS4Xfr5Q%uDFP&x^bc)4`E&>O^a`LFfHJ_LQPhwp+ z@Y26k*3t)MNtKE({2Hd|6Xs8ig(YEAQJ`460HVUM>dMS7cRd$bk^zZ5tuMs1%ROqB zVRpxQIj<+a`Fy%YI%}MjNr!&7Ao63EJx5vB3VxlV52YBCCM*31G=T=Mw2qKU?j5CH$dWExaKx?O<3{Jl@QBsmg%(UW5Y zBtzllG5T|?w(PHZv47CmuU3v4<}R)uc0-9DZc%-g8QX86%&Wk82dAqU%OyNC!3tnU zp-{mPb2Hr{&Y{~+b?(Zm2vqt#&Bcv+j)wO8<;{C(`>^Ky{cl)!wbB2A`$3@D^gVc{ zZ|F!p^|~-rA20^$BOg=Wh-}1S;{9Ug7fWzQD4`)Y$zEIC`&}YW?geNzofLj5?30EY zm54bjA3XLCR$V+ocGGdWet0t(f5!p#s81EOj-D9wNykPyZA2F`)!pn%7({tk-{5M! zPjA^hZ7Bwg1dkf}9J>;Iacb?BJK{f`(A=L0w9TNIk0%e(jGt`^!Xs3##DdN$Hsx_@ zv$>N7#C2|`qShN$)^u29;v>`-?YQhX2Rb`TNhy%42*ffFN_b)?w4m9*mMZq+`Ss7= zknE{vm@Ev#z6#>j90rR!jcTY$r+NcT#M;Y12A_JkQ}Tn(dDdB%#-EacNFYSMz55XY z0U29mcroI+ztPm1vJ@2{JE5 zhk(5-CuLg$^A+z+RzMY$hWBuOcWayD?~qq*P1*{%^hLgVj7p9vn33CZaJK-buwLW< zx<%g60DN0+z(63Lmt=~PiC_B74tkGvjWrDrA+ci$yGR8|bD~cI0uBB1PNjS-qHZ~K zo@W?7Wwk+E9owLM{g(DVWO{O>*+tFjLp(oqSS$zE=BvJqw%*#v?GoH>@J}^LrvSM8 zcwA}S8?|pdW$w3rq-%+*oC^`{t>f)*66tG)s(X_7P_nrM1+k_I;?(9wT29os@REch z0^jOb>PrXgT1~XjZpa*+TLkA8uHc_Ti8^ z6nTsR$-e6?ax(W{oRN)pjdk<4S{5))jW?Jf-kje>08Q0!_&Nat zg!)Nk$}2b&zBs~bkUmyy>d!Tsr1s^|WJj96^DTv-)GVh0YL)CK6Z>O`Z@X=ORCF0u z6jtN3T(;XD4{oVy#v|%%l{%k1;Y@Y249w1r)KRnKz>DIQGK-wU)406NV7+J7Bok8C z>bt!aOT{-BrGag>ajh^n><#M&z10Nbyqp%cQG;?TcQWrP!u1bqo^B9ETN#)2(t`Ud z#RcQ+3$f6_KK-RokEWeqMEFn)=JRmaEyZ?hhFqDeIG=8jc&6kri6U+uqsQQ)++=N? z07U%dk%F_m9e$`z;%2cP{iuTzaL2^YGKEoWrLS$2q7OT!%qDZ~C{)Bcv$WZ5?N}cE z9LY%VeQaGt8rSKTRS2Z?DZqVoZDJVobS)C0`IHV%f<2_nlYkcJYqTd@N<*a*`o9r9 zQH@2*zJold{9S>XC_#(XjC@unIR2B}K~Fw)om@m9*=NU?~hYVX+p!%UV9@Xz#Qn?MiksrIzT9u&K z#F}ES*VK2LhDxdPEK0r3xV{q?Gp+VkJ{ghFPe3tZ594=d0+MlpW?s5QRXyZ0sq0Tb zTYT^xDv}1v#_s*qNVR+VFutkJL_{&$O8S#<8Kj*%v(H;xa(}bvkw?Q00wPI^a@~j}dwXF6A2PHx`F;aRZbx-(NSgdW~G7Q&`rLYKd zmBTn95ZBy_yXX_3$bbGDjO>sb%&x7yVP$})+@sFKDl?{IhA`@bsZlE%ZXH5u%*BoHf-RrCVwwrxcFChbS?b7S0>1uUb^qhL zElQFvBD+OyzvuTlHdbbDD{(l{Wr4YRv%g&k-4lbZWqbs}CLEulLoS&pz8WvQJailn zV{D|7mEU^FWPyEuCAmf_>CTxUuaV)yyFW288bZeBu<|-Q^V*P9akZo_efj1aXe~X# ziv7c0w_``PiR{{650*!9AzyZAwm`7_#SN`K{y;%<#Ms!*%g@>7ve2#?SWdoQSgY3R zo#TWCJF`B*9I;|r(rvceFP#j99bye1y?;ep`+}-wWk*Z@JVKxB05!+k*?EP|6#qB= z8^zIazs5z!(_HQE8Ib{ZYATeLr?6`4Q7N3oz0OS0kc3@);XMS31Tu<8u-3CUiYaryFo;V{u|-&->Dm z#22WC9x~+c2HZNC_}lKOqixsORh4rAk{v0-Fv$mZO`e(F^?KD8mst=r1zd*P?0^2_ zeRPgNWYtrW5T11`-~2o$g!qQIO=Lz?_xVaXYpBAe!)4ki2ohz_1I11?AZU@5aA(10 zK+-F~$ZaD_`kc3eaPM}4q@6vqMFtM)@Gmm;esTcg=1EM_|vDT z*Hlt@K}M024pObsk0l$~OxaTTYPyXgjeQbb@g*8vU1%TG)%$lvy4E{!x69;?s~G9K z2>{xh+t$yXw&@`X4@LTYBf4gYU$x)CT_1xVH*??dgl#3n$Oye8?QNhGirYUjQ*T=* z%upn8TZY2sFGREhJ3oJMo}uc(*kX1uW;nI0DL-+H*I#^HSecg?mPB?CfXGf=F?GG{ zRMGpC$1HU3Pq_BXRLd3v@-$920ac*6UY53qH#ouF>dYs(-hf&5ZiHR0lS$!?@=XeF zV=izK@6-#_>taRsloU!$#?Adu(l9g{%;;D1f z$XRr|tXcGGHXVIO7|1@5>0#d^zpZuJJjy)Qg`b7a5>0=_URG68&XnF{+D}OwB10(W zHvd{O=K1~&71-t_Q4kcA-DKsQ{6q_|-{nnq)w7K|bZylD_q#41PthCiaS#N28gPtH zl6>;|Wy`DO%f8zUgBZ;x^;=e)8vL1t&GxRPsnR9`|5!~%Q zQl~DULKfGs-j;!-woyynyfu z-a^qV%H9;QS^5&pb}oxMt%yIT45pLvvSeI&{$j z22uqvr!H*P)s(`ulkmKIk#JjI>z5mjwf;@Gh@aA%Qs5KrH)z4LXISYwncr~i*#glc zYzt%Ft$Dbm5>5N6D7~wmYzsf%1IjG@dOz}ee$%4H{!MW3#p_U$p*oW5=Wtiy7aX9T zybDI=)O?I|P4zQ^Sj0v<=n6IkGgb4N&-pw-%|z#fH~8vnZWLQ3=cW@W(hB^Q(xlrE zCoWN<%tA&Xg!VZPGqAcm?xDj;hZJZxj}4qF&Hcr@!%QT~s?)DiM3?u1>neJDSCD`? zA_3$lPrGjAweo5Ip*k>#N(O?w8~kbwn;u5@rAJ$yp+UP9qg}`C%Rs1_ zHyniA+^XC-j6V?oc~E?QlI)#(XV zBe^e+M4SuwFY-7ocfw$7Bo&X#%SvHTZ>S)kPh!Kjh;s%v16vd+156>!r*q>i9S`p_ zcCyk)XYE@3Y()WFFQcveBFjl6f@~gVC_F zXr9L}Z|;Sl=2C7-7TwRJ7$ut1HW&oIvPPx8Zm-xJ^YZ-1H~AVKzO{<+WNYF7^8>pKsoQ*<=$jBiq7f^gVb8w3gIw?)rsZ9UYT1I%q*X zOY5=zKLnR7m9j$$r8jPu5F>gX!tT=;UqA3WJgNU3^f%+43f^9@j5kp%SRIKF|ER~o zm~z#JOp?JGVy!P@S=u8tGHsWCKc~Sk5&>u-w3Hv0hkIfLF;Y&yquAaPPdzq9hSK|d zX)j)Da3AYGr+5_?oI%X?&ZL3CJW zkJD6mnam9FmrZnQTUFWZexhN#0HnQc$w#C79w@B#twAa*k9 zm$o#aT}!M=lzO!c56k3jyfNNPY*5jP$5bK?Z$&Qtul>9px~O-kw0uv4C)R>!9cEyU ziQZgMF#sSm_AeLUrWOyGw?HEFPc4G}*UT~}#eK|v4J=|?;VEAr5k(3lIl@jI(@u?g z|EYE|n>BMlx8dJ{=#bxTFSziuF{Txp0g!BaaWLG8qd^wE2c}-qQL<v%PG1iLM_mK{-&)w*d#k|rsU5a{f_PY0*g2;Y6oG+e16Q-_tdJp4=hAOkM!X>A~$Gf`9@k%Lg)K6>wsiI1rS9+M*3Gj6fvCrbJjY zk)9-8)i{xa>R&X)sWn7(PqPrE<(WcRrX?Pye}46n&r8?`iJ~c&`P~`-`a@sn0AKbc zMUVm=x)YAMPe@m5P{-!IN5fCtXDwe!wzqqWFo zRXCA){l&`QTq_TpxVZlO?TH5a@ol&lU?+&ls?S#eM!kvo1pQr(yHNL#@QgcQKj=uL zIi!&dpwV5p6cT)J>fbrk&bvSI>Y146sVFsOAv^Lb+>~pJd_Gf`Ly{?EM865T6+bF( z3}leI79aJ~GB8o=DOs*0?Pgcu8*F5BP2=O>0XlpTGo=#BRU6oh=Ar%BH53WUY!1$A zKge(C*1q#5M1b?cJOdlLO`Au@FAKE-Pn{WJmsDJ#%vYrkrm{GBDj#DgceZyG^t1+*1K@UT6VOxYrSW;B?j#L}b-S8FsNKTM|SdAUFWHTX=9LA-zsNE}?&{Qt{2 zWhro2qfS_}-3fc+cjHd7AYXZWa8QSWgFz0VOOZr@_}lZp;tEB-rxQtcZjhW{w?8ls zclz0zQp>J{OxK317v03ua{}*`m&ZIM(N_H-kT(K3X9Gpwm*(ZU?NH-xH5S9Q*B>j& zYfPOMl>NRFP}Q>5OY|$LGylcwlMy%fe_iCfWZj8bA(||`J@d@QYBAsVmVz4v#Kw+| zK%vA+ z?#jE3reK%`ixu9)e{rfPodP03r@&Usjy6>h!@VTc_OFf`uoh@OhH!Eh zQPvcr>h-LGU=~_6d%l_U-0J?VvwGU}-jm|+QpdIjOIS0ZSt2*;}LV$^T16>ms)m4V0kP|ou6IZ)^>_!edjJ60#yECc<4 z1!HbS87mT49r&(>-p6uax3TFMO6YGvAErHRcPDei(I3rUq0CWD$){mZruuA-gg zU|)y=g_QyuKvJ1Le3n34pedk^?h9km*bmDFFcWtFN)by%pD2%Iy~2B3@XNvB z5e~1FCm9NX?+~y<5PDfX$wpmi@_?Q=G1ZFSH zM)wG~#mErvLnnr$n_&MZe-kvDW6-+{$O zX<{r)v*Fg;amGeX_imJhY)K9Av&5)M1JxWysy^Ih-_TBiIdu}#{^?v3)bxjP`(1aBknjOidq3gE_JSYNSuM!Zej7B!9hVOI|_ks{Q+ zT~bED_~_Se(hXs0_?c(8n65{S-TwSf%zw0cG%!!Du%?I(*rf?3&$z zIUK!I2)#!2wq#k{QRH4BKnq||(@fzhx0b`v322Z!iK)AjO|}T!Hbou5xOn~-o(A`y zz5Vfg&~A7jkUMiN9J+d}CTkZeO}P7&1;vWuv{kTs5zzTMLdfz+TyxIFhPjbJrr<+- z&HOBC->G6_p0bUY_RY9=lf}|m>OTSb$mq6(8)vEWoSPsLEvdAsOxh%MG+Sue*Z6~h zadI*lnB;QiDW%=OPbgJ!Z}DE(%R-khvR!MHv04Zv|3D^Pul@GrUw-z07R=j(2y$?# zcRX@|VtsSPpiTC>;1EwuA?_>3LpdY~$Z`T`l%_82&8*+V-oWN!$%MtzD7R0L${8Q# zy*LfX3IQef$D9>x^b+syoMpaU==ChgwNL{|x$69hY6`qtz8S^s`B|)fWR%D#zkOT& zvYu}7{K55mLUV3ofL-U1@KQ+bPkxF|rwbQn>7&XcUelb1OvZuH*G_VnsNKDEu19X- zTejbWZ=eTMkW@G$jku$bh41OJ`#x2;4AJv#SKYom)896bpPUu>Hcq2SNtHu#lJt3L zi@Vj3mdw7o5P^r*x*uo;e_4F_PT}>;5H)hFxNvw*?U6GPSP%R|H5acEaT8cCjr~sX zB7Bt!h}U7~qm>fPjYCwPNVKS%di_3vD&J9B4Y9R)4elLtj9m9A(odkl7U;YX3ke zU$1GO-N|Ndo#;0DC0%5cz%J>&CGbJ$UIvL5*A;l4wx-nb?WNE2$_YUUN1gG#oEIyV zb9@gGASY6HwC5j|D`ieeRrtUFA>>S059so1n&USauaCnNkY?^{ zTJrZ6lp8~%a@I9S{ZLDTS#^z+n8-6abPg-drg42OR}s#d`S(|u#=9jhbW9_Hj;kSG zj&3OX$Z|SY0%3!u)Z78zj_==qP86%kh;A`lr5QQlRIx*pl{U&Y3s*5}6p;1`9Gh_2 zXN~Edf4-dJP6M%#k9BU0{X;uXO?+z)h8gh zVz~cK;Z8ybfsrM3GU@o^N9SKe-(bkh>NQJ!-k6Y#UJie-oavOGE0l1~ot1ns7%(^t zNk7v1etM=8lSHnZuN3Xc^m~&CbLr9A8N6ku?pDW|)a}F+3jpdgw7-D_p2Tn>) zm(CcRv5#p*{<93!^m)`d5G>0}!&5!D|5}qVii8?YSELJ|PTW2RxoK`O9VCPdQ!P74 zD-8PQ>Yi`eN#m!dYDELSThr*3F1<`=xEg{D+!sIQ6O`>n?&^j?M6EeCqF2i9SkWt% zvm;uihj}lVlCDb!y5ooAvXbr;Z&J_;+bbzj3eudN_K+&@tSoVdMnxvUA{AstU>b6( z=ey|Hr`+g?t>h2$Zzv;7_6-P#dwf^BHX?H8`YxelF7RkSLCDs|en zklb#|t?M-u&>klWIhhW3eZJkF$rfKleY5$gx0a2mvahUU=^f`Y)l|P#vAm%iTXw-} zIoX6e^+x;<9J&z>MP@$?qS371#OvbqUx{<161r2M-1-(3(9dp=OvrV{n*$F9$noUs zbQgYi;H)szHe_EuIe_k5sW+Qq7g3z5KtUP!+G4T>_w)_UD^6^3nzNC6AAcS*;iKU} zVvmk2;)wO)*BkE(Ib;4bk zNFtH&95g*rA_y(G_K?zpOB0>(un+yVAhH`N*5@Ra+!r&?ZxxMqFZ~N5%l-%m#iJCD4CAU-VmstJ;bAQT?!1XrQpo3QNx=* zoEJpx922IIrqY6|YmE%0@{lo>)JW&z?nggNIS7H@zx>$ET~oIN{|UPf48 z?c-M>);jMdSLz>1{;*0t7dMs=`C(d_ZraC<%sz=?fLVxV*k6{$+zFrGUQ`}f)xt=M zFfy^2a7U;C#RaD0+YUh7)Z!kplF0M3W!cE?L!0YeF%TGxj#}nnPGzWhx{?y(X1)F6 zvqUYk58t`8mE&l%*kQJuP^}IM*c_SAFX`1P1L5SP6av`-cb7wOZ?TiVRoY(N_S4Fu zsTlnaUh*QIzPBYgGG+Hny@v}(K23!zB74n8Or<^#v2c zlCY2^0Ve)fF(t!6qJ}v;U0VycE3-YX)Kp8;ST}z|N7||bkwvUUcU*4A3(&&Mua$4e z1=e8>1KAn*tz|_lH{7saDV8x(^A?HOLn1tFIC_+|B_j_pic8rt-b%^4-UlF zWUC=N!Vw|`kWA60sg}jb-B-NPk1<2nmi+pYXV5u`QA~J#*)?=euwlj|rjibo`n5;9 zmz%ET)!sBx`$(f3Bi{coFCY8&m#cN3r(9ALx)f&*s)`ReNGVb5Wa4t=LR&>Tkuj19 z0Vaco)si-Qn*F9m+MywQ5nQH*Q{@)? z`^o3T+r~Nb^Zjy0-l;|R(_$-NfCpkKUd%qXAm0@W5h?faqGkzwMSFK0i0&bgU6DgE zqo$6retZ&HbwP%Hz2r!y;2H|h!lKF0#{JmQ=uiqBdOYO#DBhOi93_e5R} zTy7)`!QJ70P=b?$8bEngD>lC<6r_IWeyx!Px4E{*nfd592clYbT3+{;yT&Zt% zj!DU*IZLdS^0PJ1G}F-6z#Dr^EE_LVLuZ7C3F&g+W7U-T0$>>5 zel~%a?CS1j#5}L#rz}?*zi4(XVm*CHd0m;?J55Qk!`HKA<2O6m#HnU3wTrEO3X$6j zIzBSD=Xgkycl&oYR4bP2sX2!&RZAgh{4c!3ndiH;?P4f}_{h|MFFCc{gjfeT8#1V0+8cfEqIA#mIGsVUQC+8UBudI2*Z2U zdGcMa%&Q+}%SGBE4Hxa0XdXT5QCD9{m_aLu;A!`0_e$fdazD9g4UK3Ob5nB^?mX_4 zdyKh-P0?GXCWouKy&qY8BnFP=I03R{J`#njk&%_WD)-^1oHecA6Z;D&Wj4)B!}#K#@^b~y`Kh^}KHKAA= zd5*Q=f0$D@A1KegsHrVEIaaz0nloDPy9(UBs`8G@$3ouEJx)3y5_klDlGx8mX}Rj_ zpV`nUS;CPd@T;}Quh&yeRIy!lr-y@pM3TCd8jZfO+Z>+&psWtVywj2>waDv2d$&Nos z@#)uwedU=g=%dB5>oDLut3Dz`00~HvVo|-~afAK44;5PP*1IkR#?>wjQZlJL4e~w> z##bDjwf`47A-3`d$I!WKM-d}Rf79C{D?F`k&*>(t7d)qo(D;* zgP4H#{lh+JApz?mEE^8dKQ|fj^v}2F3-8l+JMgGbsN|pj*1my<`N+$e!|j<$?uoeI`%&vo!Y!f~CZ|emwrHcd8{jv3E-1NI;c%A^pR;lwf(rlz9H|0}YNrWztXNbb!F>LZ@4}r$pT(*S z^RjQhuDDmSoDu!HJP==)JMr9}=7J>)IAvj%;#!90_*Bl4s)SV^h)lSdm}n;*mh2gff+}!UQfXcZ zTXpJ*`5v8Reyd_gS6y6_LBj#FAHj6acI+=W*v>nx7IrnU-v}l6j0I)>l7}vEaCDk= zPLkM*${>iQf~srSi^Ac{!|b}!4qfhT;=blpz+{R)j81z%(vQX%=P0~2!|@fu?zgTu zOk$^}&o3#qK7iGKk)3ZciE|6rZQz1k(7L(j=@Y8)CHY;UPT^@~BbV3!XqV(IdbH8j zNCpDW>rcJ$V|ZH*qSDHY7_da>E%TyHJg_IGnf zWpXlagjKoFy-9hfp9tYg_J`Re&=x3wEmZ&!Xw#{jEepLt`77tP$+JI>E6&clz2uGM zWZ(0;p4^k*ba0#zG=bxE#X}(bqLCRcq3+gi8Tapnw@|kqzK8xbDgM5Z zTC}{s5JD2z4KlgDsC;aTtq1u3xw!u8maI`o)DxQmshkW6NR-)myjZJ>P`0&*F3FsP zBiUK5o7`Yh;W<`t-u`k=^Po9w#(;dY$*ojalAr7@v?^CvP4d;m*H(qjk#7+Q${wd1 zUxVf3@N#<~ z`9Q$g-`jE}$bmyZ#{xcVJ#^l$l42|cUgNHZUn_s8L;8n0-!9)Yn4BXX$fv8+I32eLMrqdzleIGG}jLr56UV|eu~JEag3rJ;t3Tb zHp{=5D$YT-%>H@+E_EdAlAPriEeCF@#vIMdWqaUoYo`;hmz23HUqo0=T3}o;&uW<4 zoa`=PFsq?nA^~?wu=QyEQsN?<-buG5$nxl!m>h3>hN3-_R9sHb>4*d>;^GMzt!6Ux zR6_5>rZ6UUiqTbdG72e*8FfL+SlE=$dq=e)oG^1J6buk`K49x5KyB2W~?rE zINmd6h4b_Pq882gb?;F^$mD2O=4NRO*Yp@l{*8ByH0C`fHG|cNSCZph;>xx?<<5}kvfrgIP^$2gYG5SOfdDv(EqoFUc;se9>q}-@!4>G7P_E)${=1-E;Ze7pP bCGeG$wZDL|lOXx-KfplG1pZDJ9rgbJhGUhG literal 0 HcmV?d00001 diff --git a/src/buffer.rs b/src/buffer.rs new file mode 100644 index 0000000..72dcace --- /dev/null +++ b/src/buffer.rs @@ -0,0 +1,411 @@ +//! A highly optimized version of SeaHash. + +use std::slice; + +use helper; + +/// A SeaHash state. +#[derive(Clone)] +pub struct State { + /// `a` + a: u64, + /// `b` + b: u64, + /// `c` + c: u64, + /// `d` + d: u64, + /// The number of written bytes. + written: u64, +} + +impl State { + /// Create a new state vector with some initial values. + pub fn new(a: u64, b: u64, c: u64, d: u64) -> State { + State { + a: a, + b: b, + c: c, + d: d, + written: 0, + } + } + + /// Hash a buffer with some seed. + pub fn hash(buf: &[u8], (mut a, mut b, mut c, mut d): (u64, u64, u64, u64)) -> State { + unsafe { + // We use 4 different registers to store seperate hash states, because this allows us + // to update them seperately, and consequently exploiting ILP to update the states in + // parallel. + + // The pointer to the current bytes. + let mut ptr = buf.as_ptr(); + // The end of the "main segment", i.e. the biggest buffer s.t. the length is divisible + // by 32. + let end_ptr = buf.as_ptr().offset(buf.len() as isize & !0x1F); + + while end_ptr > ptr { + // Modern CPUs allow the pointer arithmetic to be done in place, hence not + // introducing tmpvars. + a ^= helper::read_u64(ptr); + b ^= helper::read_u64(ptr.offset(8)); + c ^= helper::read_u64(ptr.offset(16)); + d ^= helper::read_u64(ptr.offset(24)); + + // Increment the pointer. + ptr = ptr.offset(32); + + // Diffuse the updated registers. We hope that each of these are executed in + // parallel. + a = helper::diffuse(a); + b = helper::diffuse(b); + c = helper::diffuse(c); + d = helper::diffuse(d); + } + + // Calculate the number of excessive bytes. These are bytes that could not be handled + // in the loop above. + let mut excessive = buf.len() as usize + buf.as_ptr() as usize - end_ptr as usize; + // Handle the excessive bytes. + match excessive { + 0 => {} + 1..=7 => { + // 1 or more excessive. + + // Write the last excessive bytes (<8 bytes). + a ^= helper::read_int(slice::from_raw_parts(ptr as *const u8, excessive)); + + // Diffuse. + a = helper::diffuse(a); + } + 8 => { + // 8 bytes excessive. + + // Mix in the partial block. + a ^= helper::read_u64(ptr); + + // Diffuse. + a = helper::diffuse(a); + } + 9..=15 => { + // More than 8 bytes excessive. + + // Mix in the partial block. + a ^= helper::read_u64(ptr); + + // Write the last excessive bytes (<8 bytes). + excessive = excessive - 8; + b ^= helper::read_int(slice::from_raw_parts(ptr.offset(8), excessive)); + + // Diffuse. + a = helper::diffuse(a); + b = helper::diffuse(b); + } + 16 => { + // 16 bytes excessive. + + // Mix in the partial block. + a = helper::diffuse(a ^ helper::read_u64(ptr)); + b = helper::diffuse(b ^ helper::read_u64(ptr.offset(8))); + } + 17..=23 => { + // 16 bytes or more excessive. + + // Mix in the partial block. + a ^= helper::read_u64(ptr); + b ^= helper::read_u64(ptr.offset(8)); + + // Write the last excessive bytes (<8 bytes). + excessive = excessive - 16; + c ^= helper::read_int(slice::from_raw_parts(ptr.offset(16), excessive)); + + // Diffuse. + a = helper::diffuse(a); + b = helper::diffuse(b); + c = helper::diffuse(c); + } + 24 => { + // 24 bytes excessive. + + // Mix in the partial block. + a ^= helper::read_u64(ptr); + b ^= helper::read_u64(ptr.offset(8)); + c ^= helper::read_u64(ptr.offset(16)); + + // Diffuse. + a = helper::diffuse(a); + b = helper::diffuse(b); + c = helper::diffuse(c); + } + _ => { + // More than 24 bytes excessive. + + // Mix in the partial block. + a ^= helper::read_u64(ptr); + b ^= helper::read_u64(ptr.offset(8)); + c ^= helper::read_u64(ptr.offset(16)); + + // Write the last excessive bytes (<8 bytes). + excessive = excessive - 24; + d ^= helper::read_int(slice::from_raw_parts(ptr.offset(24), excessive)); + + // Diffuse. + a = helper::diffuse(a); + b = helper::diffuse(b); + c = helper::diffuse(c); + d = helper::diffuse(d); + } + } + } + + State { + a: a, + b: b, + c: c, + d: d, + written: buf.len() as u64, + } + } + + /// Write another 64-bit integer into the state. + pub fn push(&mut self, x: u64) { + // Mix `x` into `a`. + let a = helper::diffuse(self.a ^ x); + + // Rotate around. + // _______________________ + // | v + // a <---- b <---- c <---- d + self.a = self.b; + self.b = self.c; + self.c = self.d; + self.d = a; + + // Increase the written bytes counter. + self.written += 8; + } + + /// Remove the most recently written 64-bit integer from the state. + /// + /// Given the value of the most recently written u64 `last`, remove it from the state. + pub fn pop(&mut self, last: u64) { + // Un-mix `last` from `d`. Removes the recently written data. + let d = helper::undiffuse(self.d) ^ last; + + // Rotate back. + // _______________________ + // v | + // a ----> b ----> c ----> d + self.d = self.c; + self.c = self.b; + self.b = self.a; + self.a = d; + + // Decrese the written bytes counter. + self.written -= 8; + } + + /// Finalize the state. + #[inline] + pub fn finalize(self) -> u64 { + let State { + written, + mut a, + b, + mut c, + d, + } = self; + + // XOR the states together. Even though XOR is commutative, it doesn't matter, because the + // state vector's initial components are mutually distinct, and thus swapping even and odd + // chunks will affect the result, because it is sensitive to the initial condition. + a ^= b; + c ^= d; + a ^= c; + // XOR the number of written bytes in order to make the excessive bytes zero-sensitive + // (without this, two excessive zeros would be equivalent to three excessive zeros). This + // is know as length padding. + a ^= written; + + // We diffuse to make the excessive bytes discrete (i.e. small changes shouldn't give small + // changes in the output). + helper::diffuse(a) + } +} + +/// Hash some buffer. +/// +/// This is a highly optimized implementation of SeaHash. It implements numerous techniques to +/// improve performance: +/// +/// - Register allocation: This makes a great deal out of making sure everything fits into +/// registers such that minimal memory accesses are needed. This works quite successfully on most +/// CPUs, and the only time it reads from memory is when it fetches the data of the buffer. +/// - Bulk reads: Like most other good hash functions, we read 8 bytes a time. This obviously +/// improves performance a lot +/// - Independent updates: We make sure very few statements next to each other depends on the +/// other. This means that almost always the CPU will be able to run the instructions in parallel. +/// - Loop unrolling: The hot loop is unrolled such that very little branches (one every 32 bytes) +/// are needed. +/// +/// and more. +/// +/// The seed of this hash function is prechosen. +pub fn hash(buf: &[u8]) -> u64 { + hash_seeded( + buf, + 0x16f11fe89b0d677c, + 0xb480a793d8e6c86c, + 0x6fe2e5aaf078ebc9, + 0x14f994a4c5259381, + ) +} + +/// Hash some buffer according to a chosen seed. +/// +/// The keys are expected to be chosen from a uniform distribution. The keys should be mutually +/// distinct to avoid issues with collisions if the lanes are permuted. +/// +/// This is not secure, as [the key can be extracted with a bit of computational +/// work](https://github.com/ticki/tfs/issues/5), as such, it is recommended to have a fallback +/// hash function (adaptive hashing) in the case of hash flooding. It can be considered unbroken if +/// the output is not known (i.e. no malicious party has access to the raw values of the keys, only +/// a permutation thereof).), however I absolutely do not recommend using it for this. If you want +/// to be strict, this should only be used as a layer of obfuscation, such that the fallback (e.g. +/// SipHash) is harder to trigger. +/// +/// In the future, I might strengthen the security if possible while having backward compatibility +/// with the default initialization vector. +pub fn hash_seeded(buf: &[u8], a: u64, b: u64, c: u64, d: u64) -> u64 { + State::hash(buf, (a, b, c, d)).finalize() +} + +#[cfg(test)] +mod tests { + use super::*; + + use reference; + + fn hash_match(a: &[u8]) { + assert_eq!(hash(a), reference::hash(a)); + assert_eq!( + hash_seeded(a, 1, 1, 1, 1), + reference::hash_seeded(a, 1, 1, 1, 1) + ); + assert_eq!( + hash_seeded(a, 500, 2873, 2389, 9283), + reference::hash_seeded(a, 500, 2873, 2389, 9283) + ); + assert_eq!( + hash_seeded(a, 238945723984, 872894734, 239478243, 28937498234), + reference::hash_seeded(a, 238945723984, 872894734, 239478243, 28937498234) + ); + assert_eq!( + hash_seeded(a, !0, !0, !0, !0), + reference::hash_seeded(a, !0, !0, !0, !0) + ); + assert_eq!( + hash_seeded(a, 0, 0, 0, 0), + reference::hash_seeded(a, 0, 0, 0, 0) + ); + } + + #[test] + #[cfg_attr(miri, ignore)] // very slow to run on miri + fn zero() { + let arr = [0; 4096]; + for n in 0..4096 { + hash_match(&arr[0..n]); + } + } + + #[test] + fn seq() { + let mut buf = [0; 4096]; + for i in 0..4096 { + buf[i] = i as u8; + } + hash_match(&buf); + } + + #[test] + fn position_depedent() { + let mut buf1 = [0; 4098]; + for i in 0..4098 { + buf1[i] = i as u8; + } + let mut buf2 = [0; 4098]; + for i in 0..4098 { + buf2[i] = i as u8 ^ 1; + } + + assert!(hash(&buf1) != hash(&buf2)); + } + + #[test] + fn shakespear() { + hash_match(b"to be or not to be"); + hash_match(b"love is a wonderful terrible thing"); + } + + #[test] + fn zero_senitive() { + assert_ne!(hash(&[1, 2, 3, 4]), hash(&[1, 0, 2, 3, 4])); + assert_ne!(hash(&[1, 2, 3, 4]), hash(&[1, 0, 0, 2, 3, 4])); + assert_ne!(hash(&[1, 2, 3, 4]), hash(&[1, 2, 3, 4, 0])); + assert_ne!(hash(&[1, 2, 3, 4]), hash(&[0, 1, 2, 3, 4])); + assert_ne!(hash(&[0, 0, 0]), hash(&[0, 0, 0, 0, 0])); + } + + #[test] + fn not_equal() { + assert_ne!(hash(b"to be or not to be "), hash(b"to be or not to be")); + assert_ne!(hash(b"jkjke"), hash(b"jkjk")); + assert_ne!(hash(b"ijkjke"), hash(b"ijkjk")); + assert_ne!(hash(b"iijkjke"), hash(b"iijkjk")); + assert_ne!(hash(b"iiijkjke"), hash(b"iiijkjk")); + assert_ne!(hash(b"iiiijkjke"), hash(b"iiiijkjk")); + assert_ne!(hash(b"iiiiijkjke"), hash(b"iiiiijkjk")); + assert_ne!(hash(b"iiiiiijkjke"), hash(b"iiiiiijkjk")); + assert_ne!(hash(b"iiiiiiijkjke"), hash(b"iiiiiiijkjk")); + assert_ne!(hash(b"iiiiiiiijkjke"), hash(b"iiiiiiiijkjk")); + assert_ne!(hash(b"ab"), hash(b"bb")); + } + + #[test] + fn push() { + let mut state = State::new(1, 2, 3, 4); + state.push(!0); + state.push(0); + + assert_eq!( + hash_seeded( + &[0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0], + 1, + 2, + 3, + 4 + ), + state.finalize() + ); + } + + #[test] + fn pop() { + let mut state = State::new(1, 2, 3, 4); + state.push(!0); + state.push(0); + state.pop(0); + + assert_eq!( + hash_seeded( + &[0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + 1, + 2, + 3, + 4 + ), + state.finalize() + ); + } +} diff --git a/src/helper.rs b/src/helper.rs new file mode 100644 index 0000000..567880a --- /dev/null +++ b/src/helper.rs @@ -0,0 +1,141 @@ +//! Helper functions. + +/// Read a buffer smaller than 8 bytes into an integer in little-endian. +/// +/// This assumes that `buf.len() < 8`. If this is not satisfied, the behavior is unspecified. +#[inline(always)] +pub fn read_int(buf: &[u8]) -> u64 { + // Because we want to make sure that it is register allocated, we fetch this into a variable. + // It will likely make no difference anyway, though. + let ptr = buf.as_ptr(); + + unsafe { + // Break it down to reads of integers with widths in total spanning the buffer. This minimizes + // the number of reads + match buf.len() { + // u8. + 1 => *ptr as u64, + // u16. + 2 => (ptr as *const u16).read_unaligned().to_le() as u64, + // u16 + u8. + 3 => { + let a = (ptr as *const u16).read_unaligned().to_le() as u64; + let b = *ptr.offset(2) as u64; + + a | (b << 16) + } + // u32. + 4 => (ptr as *const u32).read_unaligned().to_le() as u64, + // u32 + u8. + 5 => { + let a = (ptr as *const u32).read_unaligned().to_le() as u64; + let b = *ptr.offset(4) as u64; + + a | (b << 32) + } + // u32 + u16. + 6 => { + let a = (ptr as *const u32).read_unaligned().to_le() as u64; + let b = (ptr.offset(4) as *const u16).read_unaligned().to_le() as u64; + + a | (b << 32) + } + // u32 + u16 + u8. + 7 => { + let a = (ptr as *const u32).read_unaligned().to_le() as u64; + let b = (ptr.offset(4) as *const u16).read_unaligned().to_le() as u64; + let c = *ptr.offset(6) as u64; + + a | (b << 32) | (c << 48) + } + _ => 0, + } + } +} + +/// Read a little-endian 64-bit integer from some buffer. +#[inline(always)] +pub unsafe fn read_u64(ptr: *const u8) -> u64 { + #[cfg(target_pointer_width = "32")] + { + // We cannot be sure about the memory layout of a potentially emulated 64-bit integer, so + // we read it manually. If possible, the compiler should emit proper instructions. + let a = (ptr as *const u32).read_unaligned().to_le(); + let b = (ptr.offset(4) as *const u32).read_unaligned().to_le(); + + a as u64 | ((b as u64) << 32) + } + + #[cfg(target_pointer_width = "64")] + { + (ptr as *const u64).read_unaligned().to_le() + } +} + +/// The diffusion function. +/// +/// This is a bijective function emitting chaotic behavior. Such functions are used as building +/// blocks for hash functions. +pub const fn diffuse(mut x: u64) -> u64 { + // These are derived from the PCG RNG's round. Thanks to @Veedrac for proposing this. The basic + // idea is that we use dynamic shifts, which are determined by the input itself. The shift is + // chosen by the higher bits, which means that changing those flips the lower bits, which + // scatters upwards because of the multiplication. + + x = x.wrapping_mul(0x6eed0e9da4d94a4f); + let a = x >> 32; + let b = x >> 60; + x ^= a >> b; + x = x.wrapping_mul(0x6eed0e9da4d94a4f); + + x +} + +/// Reverse the `diffuse` function. +pub const fn undiffuse(mut x: u64) -> u64 { + // 0x2f72b4215a3d8caf is the modular multiplicative inverse of the constant used in `diffuse`. + + x = x.wrapping_mul(0x2f72b4215a3d8caf); + let a = x >> 32; + let b = x >> 60; + x ^= a >> b; + x = x.wrapping_mul(0x2f72b4215a3d8caf); + + x +} + +#[cfg(test)] +mod tests { + use super::*; + + fn diffuse_test(x: u64, y: u64) { + assert_eq!(diffuse(x), y); + assert_eq!(x, undiffuse(y)); + assert_eq!(undiffuse(diffuse(x)), x); + } + + #[test] + fn read_int_() { + assert_eq!(read_int(&[2, 3]), 770); + assert_eq!(read_int(&[3, 2]), 515); + assert_eq!(read_int(&[3, 2, 5]), 328195); + } + + #[test] + fn read_u64_() { + unsafe { + assert_eq!(read_u64([1, 0, 0, 0, 0, 0, 0, 0].as_ptr()), 1); + assert_eq!(read_u64([2, 1, 0, 0, 0, 0, 0, 0].as_ptr()), 258); + } + } + + #[test] + fn diffuse_test_vectors() { + diffuse_test(94203824938, 17289265692384716055); + diffuse_test(0xDEADBEEF, 12110756357096144265); + diffuse_test(0, 0); + diffuse_test(1, 15197155197312260123); + diffuse_test(2, 1571904453004118546); + diffuse_test(3, 16467633989910088880); + } +} diff --git a/src/impl_std.rs b/src/impl_std.rs new file mode 100644 index 0000000..bec3483 --- /dev/null +++ b/src/impl_std.rs @@ -0,0 +1,32 @@ +use crate::SeaHasher; +use std::hash::Hasher; +use std::io; + +impl io::Write for SeaHasher { + fn write(&mut self, buf: &[u8]) -> io::Result { + Hasher::write(self, buf); + Ok(buf.len()) + } + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn hash_write_trait() { + let reader: &[u8] = &[ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ]; + let mut hasher = SeaHasher::new(); + // io::copy consumes the mutable reader -> cloning the buffer + let _ = io::copy(&mut reader.clone(), &mut hasher).unwrap(); + let hash = hasher.finish(); + let control = crate::hash(&reader); + assert_eq!(control, hash); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..45799d8 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,168 @@ +//! SeaHash: A blazingly fast, portable hash function with proven statistical guarantees. +//! +//! SeaHash is a hash function with performance better than (around 3-20% improvement) xxHash and +//! MetroHash. Furthermore, SeaHash has mathematically provable statistical guarantees. +//! +//! SeaHash is a portable hash function, meaning that the output is not dependent on the hosting +//! architecture, and makes no assumptions on endianness or the alike. This stable layout allows it +//! to be used for on-disk/permanent storage (e.g. checksums). +//! +//! # Design, advantages, and features +//! +//! - **High quality**: It beats most other general purpose hash functions because it provides full +//! avalanche inbetween state updates. +//! - **Performance**: SeaHash beats every high-quality (grading 10/10 in smhasher) hash function +//! that I know of. +//! - **Provable quality guarantees**: Contrary to most other non-cryptographic hash function, +//! SeaHash can be proved to satisfy the avalanche criterion as well as BIC. +//! - **Parallelizable**: Consists of multiple, independent states to take advantage of ILP and/or +//! software threads. +//! - **Bulk reads**: Reads 8 or 4 bytes a time. +//! - **Stable and portable**: Does not depend on the target architecture, and produces a stable +//! value, which is only changed in major version bumps. +//! - **Keyed**: Designed to not leak the seed/key. Note that it has not gone through +//! cryptoanalysis yet, so the keyed version shouldn't be relied on when security is needed. +//! - **Hardware accelerateable**: SeaHash is designed such that ASICs can implement it with really +//! high performance. +//! +//! # A word of warning! +//! +//! This is **not** a cryptographic function, and it certainly should not be used as one. If you +//! want a good cryptographic hash function, you should use SHA-3 (Keccak) or BLAKE2. +//! +//! It is not secure, nor does it aim to be. It aims to have high quality pseudorandom output and +//! few collisions, as well as being fast. +//! +//! # Benchmark +//! +//! On normal hardware, it is expected to run with a rate around 5.9-6.7 GB/S on a 2.5 GHz CPU. +//! Further improvement can be seen when hashing very big buffers in parallel. +//! +//! | Function | Quality | Cycles per byte (lower is better) | Author +//! |-------------|---------------|-----------------------------------|------------------- +//! | **SeaHash** | **Excellent** | **0.24** | **Ticki** +//! | xxHash | Excellent | 0.31 | Collet +//! | MetroHash | Excellent | 0.35 | Rogers +//! | Murmur | Excellent | 0.64 | Appleby +//! | Rabin | Medium | 1.51 | Rabin +//! | CityHash | Excellent | 1.62 | Pike, Alakuijala +//! | LoseLose | Terrible | 2.01 | Kernighan, Ritchie +//! | FNV | Poor | 3.12 | Fowler, Noll, Vo +//! | SipHash | Pseudorandom | 3.21 | Aumasson, Bernstein +//! | CRC | Good | 3.91 | Peterson +//! | DJB2 | Poor | 4.13 | Bernstein +//! +//! ## Ideal architecture +//! +//! SeaHash is designed and optimized for the most common architecture in use: +//! +//! - Little-endian +//! - 64-bit +//! - 64 or more bytes cache lines +//! - 4 or more instruction pipelines +//! - 4 or more 64-bit registers +//! +//! Anything that does not hold the above requirements will perform worse by up to 30-40%. Note that +//! this means it is still faster than CityHash (~1 GB/S), MurMurHash (~2.6 GB/S), FNV (~0.5 GB/S), +//! etc. +//! +//! # Achieving the performance +//! +//! Like any good general-purpose hash function, SeaHash reads 8 bytes at once effectively reducing +//! the running time by an order of ~5. +//! +//! Secondly, SeaHash achieves the performance by heavily exploiting Instruction-Level Parallelism. +//! In particular, it fetches 4 integers in every round and independently diffuses them. This +//! yields four different states, which are finally combined. +//! +//! # Statistical guarantees +//! +//! SeaHash comes with certain proven guarantees about the statistical properties of the output: +//! +//! 1. Pick some _n_-byte sequence, _s_. The number of _n_-byte sequence colliding with _s_ is +//! independent of the choice of _s_ (all equivalence class have equal size). +//! 2. If you flip any bit in the input, the probability for any bit in the output to be flipped is +//! 0.5. +//! 3. The hash value of a sequence of uniformly distributed bytes is itself uniformly distributed. +//! +//! The first guarantee can be derived through deduction, by proving that the diffusion function is +//! bijective (reverse the XORs and find the congruence inverses to the primes). +//! +//! The second guarantee requires more complex calculations: Construct a matrix of probabilities +//! and set one to certain (1), then apply transformations through the respective operations. The +//! proof is a bit long, but relatively simple. +//! +//! The third guarantee requires proving that the hash value is a tree, such that: +//! - Leafs represents the input values. +//! - Single-child nodes reduce to the diffusion of the child. +//! - Multiple-child nodes reduce to the sum of the children. +//! +//! Then simply show that each of these reductions transform uniformly distributed variables to +//! uniformly distributed variables. +//! +//! # Inner workings +//! +//! In technical terms, SeaHash follows a alternating 4-state length-padded Merkle–Damgård +//! construction with an XOR-diffuse compression function (click to enlarge): +//! +//! [![A diagram.](http://ticki.github.io/img/seahash_construction_diagram.svg)] +//! (http://ticki.github.io/img/seahash_construction_diagram.svg) +//! +//! It starts with 4 initial states, then it alternates between them (increment, wrap on 4) and +//! does XOR with the respective block. When a state has been visited the diffusion function (f) is +//! applied. The very last block is padded with zeros. +//! +//! After all the blocks have been gone over, all the states are XOR'd to the number of bytes +//! written. The sum is then passed through the diffusion function, which produces the final hash +//! value. +//! +//! The diffusion function is drawn below. +//! +//! ```notest +//! x ← px +//! x ← x ⊕ ((x ≫ 32) ≫ (x ≫ 60)) +//! x ← px +//! ``` +//! +//! The advantage of having four completely segregated (note that there is no mix round, so they're +//! entirely independent) states is that fast parallelism is possible. For example, if I were to +//! hash 1 TB, I can spawn up four threads which can run independently without _any_ +//! intercommunication or synchronization before the last round. +//! +//! If the diffusion function (f) was cryptographically secure, it would pass cryptoanalysis +//! trivially. This might seem irrelevant, as it clearly isn't cryptographically secure, but it +//! tells us something about the inner semantics. In particular, any diffusion function with +//! sufficient statistical quality will make up a good hash function in this construction. +//! +//! Read [the blog post](http://ticki.github.io/blog/seahash-explained/) for more details. +//! +//! # ASIC version +//! +//! SeaHash is specifically designed such that it can be efficiently implemented in the form of +//! ASIC while only using very few transistors. +//! +//! # Specification +//! +//! See the [`reference`](./reference) module. +//! +//! # Credits +//! +//! Aside for myself (@ticki), there are couple of other people who have helped creating this. +//! Joshua Landau suggested using the [PCG family of diffusions](http://www.pcg-random.org/), +//! created by Melissa E. O'Neill. Sokolov Yura spotted multiple bugs in SeaHash. + +#![warn(missing_docs)] +#![cfg_attr(all(not(test), not(feature = "use_std")), no_std)] +#[cfg(all(not(test), not(feature = "use_std")))] +extern crate core as std; + +pub use buffer::{hash, hash_seeded, State}; +pub use stream::SeaHasher; + +mod buffer; +mod helper; +pub mod reference; +mod stream; + +#[cfg(feature = "use_std")] +mod impl_std; diff --git a/src/reference.rs b/src/reference.rs new file mode 100644 index 0000000..2585014 --- /dev/null +++ b/src/reference.rs @@ -0,0 +1,152 @@ +//! A slow, but clear reference implementation of SeaHash. +//! +//! # Specification +//! +//! The input buffer is padded with null bytes until the length is divisible by 8. +//! +//! We start out with state +//! +//! ```notest +//! a = 0x16f11fe89b0d677c +//! b = 0xb480a793d8e6c86c +//! c = 0x6fe2e5aaf078ebc9 +//! d = 0x14f994a4c5259381 +//! ``` +//! +//! If a seed is given, each of the initial state component are modularly multiplied by the seed. +//! +//! From the stream, we read one 64-bit block (in little-endian) at a time. This number, `n`, +//! determines the new state by: +//! +//! ```notest +//! a' = b +//! b' = c +//! c' = d +//! d' = g(a ⊕ n) +//! ``` +//! +//! `g(x)` is defined as `g(x) = j(h(j(x)))` with `h(x) = (x ≫ 32) ≫ (x ≫ 60)` and `j(x) ≡ px (mod +//! 2^64)` with `p = 0x7ed0e9fa0d94a33`. +//! +//! Let the final state be `(x, y, z, w)`. Then the final result is given by `H = g(x ⊕ y ⊕ z ⊕ w ⊕ +//! l)` where `l` is the number of bytes in the original buffer. + +use helper; + +/// Read an integer in little-endian. +fn read_int(int: &[u8]) -> u64 { + debug_assert!( + int.len() <= 8, + "The buffer length of the integer must be less than or equal to \ + the one of an u64." + ); + + // Start at 0. + let mut x = 0; + for &i in int.iter().rev() { + // Shift up a byte. + x <<= 8; + // Set the lower byte. + x |= i as u64; + } + + x +} + +/// A hash state. +struct State { + /// The `a` substate. + a: u64, + /// The `b` substate. + b: u64, + /// The `c` substate. + c: u64, + /// The `d` substate. + d: u64, +} + +impl State { + /// Write a 64-bit integer to the state. + fn write_u64(&mut self, x: u64) { + let mut a = self.a; + + // Mix `x` into `a`. + a = helper::diffuse(a ^ x); + + // Rotate around. + // _______________________ + // | v + // a <---- b <---- c <---- d + self.a = self.b; + self.b = self.c; + self.c = self.d; + self.d = a; + } + + /// Calculate the final hash. + fn finish(self, total: usize) -> u64 { + // Even though XORing is commutative, it doesn't matter, because the state vector's initial + // components are mutually distinct, and thus swapping even and odd chunks will affect the + // result, because it is sensitive to the initial condition. To add discreteness, we + // diffuse. + helper::diffuse( + self.a ^ self.b ^ self.c ^ self.d + // We XOR in the number of written bytes to make it zero-sensitive when excessive bytes + // are written (0u32.0u8 ≠ 0u16.0u8). + ^ total as u64, + ) + } + + /// Create a new state with some initial values (seed). + fn with_seeds(k1: u64, k2: u64, k3: u64, k4: u64) -> State { + State { + // These values are randomly generated. + a: k1, + b: k2, + c: k3, + d: k4, + } + } +} + +/// A reference implementation of SeaHash. +/// +/// This is bloody slow when compared to the optimized version. This is because SeaHash was +/// specifically designed to take all sorts of hardware and software hacks into account to achieve +/// maximal performance, but this makes code significantly less readable. As such, this version has +/// only one goal: to make the algorithm readable and understandable. +pub fn hash(buf: &[u8]) -> u64 { + hash_seeded( + buf, + 0x16f11fe89b0d677c, + 0xb480a793d8e6c86c, + 0x6fe2e5aaf078ebc9, + 0x14f994a4c5259381, + ) +} + +/// The seeded version of the reference implementation. +pub fn hash_seeded(buf: &[u8], k1: u64, k2: u64, k3: u64, k4: u64) -> u64 { + // Initialize the state. + let mut state = State::with_seeds(k1, k2, k3, k4); + + // Partition the rounded down buffer into chunks of 8 bytes, and iterate over them. The last + // block might not be 8 bytes long. + for int in buf.chunks(8) { + // Read the chunk into an integer and write into the state. + state.write_u64(read_int(int)); + } + + // Finish the hash state and return the final value. + state.finish(buf.len()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn shakespear() { + assert_eq!(hash(b"to be or not to be"), 1988685042348123509); + } +} diff --git a/src/stream.rs b/src/stream.rs new file mode 100644 index 0000000..cf35ebc --- /dev/null +++ b/src/stream.rs @@ -0,0 +1,284 @@ +use std::hash::Hasher; +use std::slice; + +use helper; + +/// The streaming version of the algorithm. +#[derive(Clone, Copy)] +pub struct SeaHasher { + /// The state of the hasher. + state: (u64, u64, u64, u64), + /// The number of bytes we have written in total + written: u64, + /// Our tail + tail: u64, + /// The number of bytes in the tail + ntail: usize, +} + +impl Default for SeaHasher { + fn default() -> SeaHasher { + SeaHasher::with_seeds( + 0x16f11fe89b0d677c, + 0xb480a793d8e6c86c, + 0x6fe2e5aaf078ebc9, + 0x14f994a4c5259381, + ) + } +} + +impl SeaHasher { + /// Create a new `SeaHasher` with default state. + pub fn new() -> SeaHasher { + SeaHasher::default() + } + + /// Construct a new `SeaHasher` given some seed. + /// + /// For maximum quality, these seeds should be chosen at random. + pub fn with_seeds(k1: u64, k2: u64, k3: u64, k4: u64) -> SeaHasher { + SeaHasher { + state: (k1, k2, k3, k4), + written: 0, + tail: 0, + ntail: 0, + } + } + + #[inline(always)] + fn push(&mut self, x: u64) { + let a = helper::diffuse(self.state.0 ^ x); + self.state.0 = self.state.1; + self.state.1 = self.state.2; + self.state.2 = self.state.3; + self.state.3 = a; + self.written += 8; + } + + #[inline(always)] + fn push_bytes(&mut self, bytes: &[u8]) { + // The start of the bytes that aren't in the tail + let copied = core::cmp::min(8 - self.ntail, bytes.len()); + unsafe { + let mut this = self.tail.to_le_bytes(); + let mut ptr = bytes.as_ptr(); + ptr.copy_to_nonoverlapping(this.as_mut_ptr().add(self.ntail), copied); + // It will be at most 8 + if copied + self.ntail != 8 { + self.ntail += copied; + self.tail = u64::from_le_bytes(this); + } else { + self.push(u64::from_le_bytes(this)); + self.ntail = 0; + self.tail = 0; + + // We've done the existing tail, now just do the rest in chunks of 4 x u64. + ptr = ptr.offset(copied as isize); + let end_ptr = ptr.offset((bytes.len() - copied) as isize & !0x1F); + while end_ptr > ptr { + self.state.0 = helper::diffuse(self.state.0 ^ helper::read_u64(ptr)); + self.state.1 = helper::diffuse(self.state.1 ^ helper::read_u64(ptr.offset(8))); + self.state.2 = helper::diffuse(self.state.2 ^ helper::read_u64(ptr.offset(16))); + self.state.3 = helper::diffuse(self.state.3 ^ helper::read_u64(ptr.offset(24))); + + ptr = ptr.offset(32); + self.written += 32; + } + let mut excessive = bytes.len() + bytes.as_ptr() as usize - ptr as usize; + match excessive { + 0 => { + // input was a multiple of 4 x u64 bytes long; no new tail bytes. + } + 1..=7 => { + self.tail = + helper::read_int(slice::from_raw_parts(ptr as *const u8, excessive)); + self.ntail = excessive; + // self.written does not need to be updated as we only gathered self.tail + // bytes after larger chunks. + } + 8 => { + self.push(helper::read_u64(ptr)); + // self.written is updated by self.push + } + 9..=15 => { + self.push(helper::read_u64(ptr)); + excessive -= 8; + self.tail = + helper::read_int(slice::from_raw_parts(ptr.offset(8), excessive)); + self.ntail = excessive; + // self.written is updated by self.push + } + 16 => { + let a = helper::diffuse(self.state.0 ^ helper::read_u64(ptr)); + let b = helper::diffuse(self.state.1 ^ helper::read_u64(ptr.offset(8))); + // rotate + self.state.0 = self.state.2; + self.state.1 = self.state.3; + self.state.2 = a; + self.state.3 = b; + self.written += 16; + } + 17..=23 => { + let a = helper::diffuse(self.state.0 ^ helper::read_u64(ptr)); + let b = helper::diffuse(self.state.1 ^ helper::read_u64(ptr.offset(8))); + // rotate + self.state.0 = self.state.2; + self.state.1 = self.state.3; + self.state.2 = a; + self.state.3 = b; + excessive -= 16; + self.tail = + helper::read_int(slice::from_raw_parts(ptr.offset(16), excessive)); + self.ntail = excessive; + self.written += 16; + } + 24 => { + let a = helper::diffuse(self.state.0 ^ helper::read_u64(ptr)); + let b = helper::diffuse(self.state.1 ^ helper::read_u64(ptr.offset(8))); + let c = helper::diffuse(self.state.2 ^ helper::read_u64(ptr.offset(16))); + self.state.0 = self.state.3; + self.state.1 = a; + self.state.2 = b; + self.state.3 = c; + self.written += 24; + } + _ => { + let a = helper::diffuse(self.state.0 ^ helper::read_u64(ptr)); + let b = helper::diffuse(self.state.1 ^ helper::read_u64(ptr.offset(8))); + let c = helper::diffuse(self.state.2 ^ helper::read_u64(ptr.offset(16))); + self.state.0 = self.state.3; + self.state.1 = a; + self.state.2 = b; + self.state.3 = c; + excessive -= 24; + self.tail = + helper::read_int(slice::from_raw_parts(ptr.offset(24), excessive)); + self.ntail = excessive; + self.written += 24; + } + } + } + } + } +} + +impl Hasher for SeaHasher { + fn finish(&self) -> u64 { + let a = if self.ntail > 0 { + let tail = helper::read_int(&self.tail.to_le_bytes()[..self.ntail]); + helper::diffuse(self.state.0 ^ tail) + } else { + self.state.0 + }; + helper::diffuse( + a ^ self.state.1 ^ self.state.2 ^ self.state.3 ^ self.written + self.ntail as u64, + ) + } + + fn write(&mut self, bytes: &[u8]) { + self.push_bytes(bytes) + } + + fn write_u64(&mut self, n: u64) { + self.write(&n.to_le_bytes()) + } + + fn write_u8(&mut self, n: u8) { + self.write(&n.to_le_bytes()) + } + + fn write_u16(&mut self, n: u16) { + self.write(&n.to_le_bytes()) + } + + fn write_u32(&mut self, n: u32) { + self.write(&n.to_le_bytes()) + } + + fn write_usize(&mut self, n: usize) { + self.write(&n.to_le_bytes()) + } + + fn write_i64(&mut self, n: i64) { + self.write(&n.to_le_bytes()) + } + + fn write_i8(&mut self, n: i8) { + self.write(&n.to_le_bytes()) + } + + fn write_i16(&mut self, n: i16) { + self.write(&n.to_le_bytes()) + } + + fn write_i32(&mut self, n: i32) { + self.write(&n.to_le_bytes()) + } + + fn write_isize(&mut self, n: isize) { + self.write(&n.to_le_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::hash_seeded; + use std::hash::Hasher; + + #[test] + fn chunked_equiv() { + let test_buf: &[u8] = &[ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ]; + + let mut stream_hasher1 = SeaHasher::default(); + Hasher::write(&mut stream_hasher1, test_buf); + + let mut stream_hasher2 = SeaHasher::default(); + Hasher::write(&mut stream_hasher2, &test_buf[..8]); + Hasher::write(&mut stream_hasher2, &test_buf[8..]); + + let mut stream_hasher3 = SeaHasher::default(); + Hasher::write(&mut stream_hasher3, &test_buf[..3]); + Hasher::write(&mut stream_hasher3, &test_buf[3..]); + + let mut stream_hasher4 = SeaHasher::default(); + Hasher::write_u16(&mut stream_hasher4, 0xffff); + Hasher::write_u16(&mut stream_hasher4, 0xffff); + Hasher::write_u32(&mut stream_hasher4, 0xffffffff); + Hasher::write_u64(&mut stream_hasher4, 0); + + assert_eq!(stream_hasher1.finish(), stream_hasher2.finish()); + assert_eq!(stream_hasher1.finish(), stream_hasher3.finish()); + assert_eq!(stream_hasher1.finish(), stream_hasher4.finish()); + } + + #[test] + fn match_optimized() { + let test_buf: &[u8] = &[ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ]; + + let mut sea_hasher = SeaHasher::with_seeds( + 0xe7b0c93ca8525013, + 0x011d02b854ae8182, + 0x7bcc5cf9c39cec76, + 0xfa336285d102d083, + ); + sea_hasher.write(test_buf); + let stream_hash = sea_hasher.finish(); + + let buffer_hash = hash_seeded( + test_buf, + 0xe7b0c93ca8525013, + 0x011d02b854ae8182, + 0x7bcc5cf9c39cec76, + 0xfa336285d102d083, + ); + + assert_eq!(buffer_hash, stream_hash) + } +} diff --git a/tests/chunking.rs b/tests/chunking.rs new file mode 100644 index 0000000..4416f1f --- /dev/null +++ b/tests/chunking.rs @@ -0,0 +1,67 @@ +extern crate seahash; +use seahash::SeaHasher as H; + +use std::hash::Hasher; + +#[test] +fn hash_chunking_vs_not() { + // originally from https://gitlab.redox-os.org/redox-os/seahash/issues/5 + let c1: &[u8] = b"This hashing algorithm was extracted from the Rustc compiler."; + let c2: &[u8] = + b" This is the same hashing algoirthm used for some internal operations in FireFox."; + let c3: &[u8] = b" The strength of this algorithm is in hashing 8 bytes at a time on 64-bit platforms, where the FNV algorithm works on one byte at a time."; + + let mut h1 = H::default(); + h1.write(c1); + h1.write(c2); + h1.write(c3); + let hash1 = h1.finish(); + + let mut c4 = Vec::::new(); + c4.extend_from_slice(c1); + c4.extend_from_slice(c2); + c4.extend_from_slice(c3); + + let mut h2 = H::default(); + h2.write(&c4); + let hash2 = h2.finish(); + + let reference = seahash::reference::hash(&c4); + let buffer = seahash::hash(&c4); + + println!("hash1: {:016x}", hash1); + println!("hash2: {:016x}", hash2); + println!("ref : {:016x}", reference); + println!("buf : {:016x}", buffer); + + assert_eq!(hash1, hash2); + assert_eq!(hash1, reference); + assert_eq!(hash1, buffer); + assert_eq!(hash1, 0xa06e72e1b06144a0); +} + +#[test] +fn test_different_chunk_sizes() { + let v = { + let c1: &[u8] = b"This hashing algorithm was extracted from the Rustc compiler."; + let c2: &[u8] = + b" This is the same hashing algoirthm used for some internal operations in FireFox."; + let c3: &[u8] = b" The strength of this algorithm is in hashing 8 bytes at a time on 64-bit platforms, where the FNV algorithm works on one byte at a time."; + + [c1, c2, c3].concat() + }; + + let mut h1 = H::default(); + h1.write(&v); + let h1 = h1.finish(); + + for chunk_len in 1..v.len() { + let mut h2 = H::default(); + for w in v.chunks(chunk_len) { + h2.write(w); + } + let h2 = h2.finish(); + + assert_eq!(h1, h2, "failed with chunk_len={}", chunk_len); + } +} diff --git a/tests/quickchecks.rs b/tests/quickchecks.rs new file mode 100644 index 0000000..f9f11a8 --- /dev/null +++ b/tests/quickchecks.rs @@ -0,0 +1,47 @@ +extern crate seahash; + +#[macro_use] +extern crate quickcheck; +use quickcheck::TestResult; + +use seahash::hash; +use seahash::reference::hash as reference; +use seahash::SeaHasher; +use std::hash::Hasher; +use std::num::{NonZeroU8, NonZeroUsize}; + +quickcheck! { + #[cfg_attr(miri, ignore)] // very slow to run on miri + fn chunked_matches_buffered(xs: Vec, chunk_size: NonZeroUsize, times: NonZeroU8, additional: u8) -> TestResult { + let target_size = xs.len() * times.get() as usize + additional as usize; + if xs.is_empty() || target_size > 10_000_000 { + TestResult::discard() + } else { + let xs = xs.into_iter() + .cycle() + // the vecs produced by quickcheck are perhaps a bit small by default. + // additional should add some noise to avoid only getting nice even lengths. + .take(target_size) + .collect::>(); + + // write all at once + let mut h0 = SeaHasher::default(); + h0.write(&xs); + let h0 = h0.finish(); + + // write in chunks + let mut h1 = SeaHasher::default(); + for chunk in xs.chunks(chunk_size.get()) { + h1.write(chunk); + } + let h1 = h1.finish(); + + // compare all, including to buffered and reference + let outcome = h0 == h1 + && h0 == hash(&xs) + && h0 == reference(&xs); + + TestResult::from_bool(outcome) + } + } +} -- 2.7.4