Update to Stockfish 15

Peter Osterlund 2022-04-23 14:00:50 +02:00
parent bc1c8a2c29
commit 07931e96a5
55 changed files with 1812 additions and 1255 deletions

View File

@ -5,7 +5,7 @@ SF_SRC_FILES := \
bitbase.cpp endgame.cpp material.cpp movepick.cpp position.cpp timeman.cpp \
tune.cpp ucioption.cpp \
bitboard.cpp evaluate.cpp misc.cpp search.cpp tt.cpp syzygy/tbprobe.cpp \
nnue/evaluate_nnue.cpp nnue/features/half_ka_v2.cpp
nnue/evaluate_nnue.cpp nnue/features/half_ka_v2_hm.cpp
MY_ARCH_DEF :=
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -87,6 +87,7 @@ const vector<string> Defaults = {
// Chess 960
"setoption name UCI_Chess960 value true",
"bbqnnrkr/pppppppp/8/8/8/8/PPPPPPPP/BBQNNRKR w HFhf - 0 1 moves g2g3 d7d5 d2d4 c8h3 c1g5 e8d6 g5e7 f7f6",
"nqbnrkrb/pppppppp/8/8/8/8/PPPPPPPP/NQBNRKRB w KQkq - 0 1",
"setoption name UCI_Chess960 value false"
};

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -61,7 +61,7 @@ namespace Stockfish {
namespace Eval {
bool useNNUE;
string eval_file_loaded = "None";
string currentEvalFileName = "None";
/// NNUE::init() tries to load a NNUE network at startup time, or when the engine
/// receives a UCI command "setoption name EvalFile value nn-[a-z0-9]{12}.nnue"
@ -78,6 +78,8 @@ namespace Eval {
return;
string eval_file = string(Options["EvalFile"]);
if (eval_file.empty())
eval_file = EvalFileDefaultName;
#if defined(DEFAULT_NNUE_DIRECTORY)
#define stringify2(x) #x
@ -88,13 +90,13 @@ namespace Eval {
#endif
for (string directory : dirs)
if (eval_file_loaded != eval_file)
if (currentEvalFileName != eval_file)
{
if (directory != "<internal>")
{
ifstream stream(directory + eval_file, ios::binary);
if (load_eval(eval_file, stream))
eval_file_loaded = eval_file;
currentEvalFileName = eval_file;
}
if (directory == "<internal>" && eval_file == EvalFileDefaultName)
@ -106,30 +108,29 @@ namespace Eval {
MemoryBuffer buffer(const_cast<char*>(reinterpret_cast<const char*>(gEmbeddedNNUEData)),
size_t(gEmbeddedNNUESize));
(void) gEmbeddedNNUEEnd; // Silence warning on unused variable
istream stream(&buffer);
if (load_eval(eval_file, stream))
eval_file_loaded = eval_file;
currentEvalFileName = eval_file;
}
}
if (eval_file_loaded != eval_file)
eval_file_loaded = "";
}
/// NNUE::verify() verifies that the last net used was loaded successfully
void NNUE::verify() {
string eval_file = string(Options["EvalFile"]);
if (eval_file.empty())
eval_file = EvalFileDefaultName;
if (useNNUE && eval_file_loaded != eval_file)
if (useNNUE && currentEvalFileName != eval_file)
{
UCI::OptionsMap defaults;
UCI::init(defaults);
string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + string(defaults["EvalFile"]);
string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(EvalFileDefaultName);
string msg5 = "The engine will be terminated now.";
sync_cout << "info string ERROR: " << msg1 << sync_endl;
@ -192,17 +193,17 @@ using namespace Trace;
namespace {
// Threshold for lazy and space evaluation
constexpr Value LazyThreshold1 = Value(1565);
constexpr Value LazyThreshold2 = Value(1102);
constexpr Value LazyThreshold1 = Value(3631);
constexpr Value LazyThreshold2 = Value(2084);
constexpr Value SpaceThreshold = Value(11551);
// KingAttackWeights[PieceType] contains king attack weights by piece type
constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 76, 46, 45, 14 };
// SafeCheck[PieceType][single/multiple] contains safe check bonus by piece type,
// higher if multiple safe checks are possible for that piece type.
constexpr int SafeCheck[][2] = {
{}, {}, {803, 1292}, {639, 974}, {1087, 1878}, {759, 1132}
{}, {}, {805, 1292}, {650, 984}, {1071, 1886}, {730, 1128}
};
#define S(mg, eg) make_score(mg, eg)
@ -228,58 +229,58 @@ namespace {
// BishopPawns[distance from edge] contains a file-dependent penalty for pawns on
// squares of the same color as our bishop.
constexpr Score BishopPawns[int(FILE_NB) / 2] = {
S(3, 8), S(3, 9), S(2, 8), S(3, 8)
S(3, 8), S(3, 9), S(2, 7), S(3, 7)
};
// KingProtector[knight/bishop] contains penalty for each distance unit to own king
constexpr Score KingProtector[] = { S(8, 9), S(6, 9) };
constexpr Score KingProtector[] = { S(9, 9), S(7, 9) };
// Outpost[knight/bishop] contains bonuses for each knight or bishop occupying a
// pawn protected square on rank 4 to 6 which is also safe from a pawn attack.
constexpr Score Outpost[] = { S(57, 38), S(31, 24) };
constexpr Score Outpost[] = { S(54, 34), S(31, 25) };
// PassedRank[Rank] contains a bonus according to the rank of a passed pawn
constexpr Score PassedRank[RANK_NB] = {
S(0, 0), S(7, 27), S(16, 32), S(17, 40), S(64, 71), S(170, 174), S(278, 262)
S(0, 0), S(2, 38), S(15, 36), S(22, 50), S(64, 81), S(166, 184), S(284, 269)
};
constexpr Score RookOnClosedFile = S(10, 5);
constexpr Score RookOnOpenFile[] = { S(19, 6), S(47, 26) };
constexpr Score RookOnOpenFile[] = { S(18, 8), S(49, 26) };
// ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
// which piece type attacks which one. Attacks on lesser pieces which are
// pawn-defended are not considered.
constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
S(0, 0), S(5, 32), S(55, 41), S(77, 56), S(89, 119), S(79, 162)
S(0, 0), S(6, 37), S(64, 50), S(82, 57), S(103, 130), S(81, 163)
};
constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
S(0, 0), S(3, 44), S(37, 68), S(42, 60), S(0, 39), S(58, 43)
S(0, 0), S(3, 44), S(36, 71), S(44, 59), S(0, 39), S(60, 39)
};
constexpr Value CorneredBishop = Value(50);
// Assorted bonuses and penalties
constexpr Score UncontestedOutpost = S( 1, 10);
constexpr Score UncontestedOutpost = S( 0, 10);
constexpr Score BishopOnKingRing = S( 24, 0);
constexpr Score BishopXRayPawns = S( 4, 5);
constexpr Score FlankAttacks = S( 8, 0);
constexpr Score Hanging = S( 69, 36);
constexpr Score Hanging = S( 72, 40);
constexpr Score KnightOnQueen = S( 16, 11);
constexpr Score LongDiagonalBishop = S( 45, 0);
constexpr Score MinorBehindPawn = S( 18, 3);
constexpr Score PassedFile = S( 11, 8);
constexpr Score PawnlessFlank = S( 17, 95);
constexpr Score ReachableOutpost = S( 31, 22);
constexpr Score RestrictedPiece = S( 7, 7);
constexpr Score PassedFile = S( 13, 8);
constexpr Score PawnlessFlank = S( 19, 97);
constexpr Score ReachableOutpost = S( 33, 19);
constexpr Score RestrictedPiece = S( 6, 7);
constexpr Score RookOnKingRing = S( 16, 0);
constexpr Score SliderOnQueen = S( 60, 18);
constexpr Score ThreatByKing = S( 24, 89);
constexpr Score SliderOnQueen = S( 62, 21);
constexpr Score ThreatByKing = S( 24, 87);
constexpr Score ThreatByPawnPush = S( 48, 39);
constexpr Score ThreatBySafePawn = S(173, 94);
constexpr Score ThreatBySafePawn = S(167, 99);
constexpr Score TrappedRook = S( 55, 13);
constexpr Score WeakQueenProtection = S( 14, 0);
constexpr Score WeakQueen = S( 56, 15);
constexpr Score WeakQueen = S( 57, 19);
#undef S
@ -988,7 +989,9 @@ namespace {
// Early exit if score is high
auto lazy_skip = [&](Value lazyThreshold) {
return abs(mg_value(score) + eg_value(score)) / 2 > lazyThreshold + pos.non_pawn_material() / 64;
return abs(mg_value(score) + eg_value(score)) > lazyThreshold
+ std::abs(pos.this_thread()->bestValue) * 5 / 4
+ pos.non_pawn_material() / 32;
};
if (lazy_skip(LazyThreshold1))
@ -1053,26 +1056,22 @@ make_v:
if ( pos.piece_on(SQ_A1) == W_BISHOP
&& pos.piece_on(SQ_B2) == W_PAWN)
correction += !pos.empty(SQ_B3) ? -CorneredBishop * 4
: -CorneredBishop * 3;
correction -= CorneredBishop;
if ( pos.piece_on(SQ_H1) == W_BISHOP
&& pos.piece_on(SQ_G2) == W_PAWN)
correction += !pos.empty(SQ_G3) ? -CorneredBishop * 4
: -CorneredBishop * 3;
correction -= CorneredBishop;
if ( pos.piece_on(SQ_A8) == B_BISHOP
&& pos.piece_on(SQ_B7) == B_PAWN)
correction += !pos.empty(SQ_B6) ? CorneredBishop * 4
: CorneredBishop * 3;
correction += CorneredBishop;
if ( pos.piece_on(SQ_H8) == B_BISHOP
&& pos.piece_on(SQ_G7) == B_PAWN)
correction += !pos.empty(SQ_G6) ? CorneredBishop * 4
: CorneredBishop * 3;
correction += CorneredBishop;
return pos.side_to_move() == WHITE ? Value(correction)
: -Value(correction);
return pos.side_to_move() == WHITE ? Value(3 * correction)
: -Value(3 * correction);
}
} // namespace Eval
@ -1084,38 +1083,37 @@ make_v:
Value Eval::evaluate(const Position& pos) {
Value v;
bool useClassical = false;
if (!Eval::useNNUE)
v = Evaluation<NO_TRACE>(pos).value();
else
// Deciding between classical and NNUE eval (~10 Elo): for high PSQ imbalance we use classical,
// but we switch to NNUE during long shuffling or with high material on the board.
if ( !useNNUE
|| ((pos.this_thread()->depth > 9 || pos.count<ALL_PIECES>() > 7) &&
abs(eg_value(pos.psq_score())) * 5 > (856 + pos.non_pawn_material() / 64) * (10 + pos.rule50_count())))
{
// Scale and shift NNUE for compatibility with search and classical evaluation
auto adjusted_NNUE = [&]()
{
int scale = 903
+ 32 * pos.count<PAWN>()
+ 32 * pos.non_pawn_material() / 1024;
v = Evaluation<NO_TRACE>(pos).value(); // classical
useClassical = abs(v) >= 297;
}
Value nnue = NNUE::evaluate(pos, true) * scale / 1024;
// If result of a classical evaluation is much lower than threshold fall back to NNUE
if (useNNUE && !useClassical)
{
Value nnue = NNUE::evaluate(pos, true); // NNUE
int scale = 1036 + 22 * pos.non_pawn_material() / 1024;
Color stm = pos.side_to_move();
Value optimism = pos.this_thread()->optimism[stm];
Value psq = (stm == WHITE ? 1 : -1) * eg_value(pos.psq_score());
int complexity = 35 * abs(nnue - psq) / 256;
if (pos.is_chess960())
nnue += fix_FRC(pos);
optimism = optimism * (44 + complexity) / 31;
v = (nnue + optimism) * scale / 1024 - optimism;
return nnue;
};
// If there is PSQ imbalance we use the classical eval, but we switch to
// NNUE eval faster when shuffling or if the material on the board is high.
int r50 = pos.rule50_count();
Value psq = Value(abs(eg_value(pos.psq_score())));
bool classical = psq * 5 > (750 + pos.non_pawn_material() / 64) * (5 + r50);
v = classical ? Evaluation<NO_TRACE>(pos).value() // classical
: adjusted_NNUE(); // NNUE
if (pos.is_chess960())
v += fix_FRC(pos);
}
// Damp down the evaluation linearly when shuffling
v = v * (100 - pos.rule50_count()) / 100;
v = v * (195 - pos.rule50_count()) / 211;
// Guarantee evaluation does not hit the tablebase range
v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
@ -1140,7 +1138,12 @@ std::string Eval::trace(Position& pos) {
std::memset(scores, 0, sizeof(scores));
pos.this_thread()->trend = SCORE_ZERO; // Reset any dynamic contempt
// Reset any global variable used in eval
pos.this_thread()->depth = 0;
pos.this_thread()->trend = SCORE_ZERO;
pos.this_thread()->bestValue = VALUE_ZERO;
pos.this_thread()->optimism[WHITE] = VALUE_ZERO;
pos.this_thread()->optimism[BLACK] = VALUE_ZERO;
v = Evaluation<TRACE>(pos).value();

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -34,12 +34,12 @@ namespace Eval {
Value evaluate(const Position& pos);
extern bool useNNUE;
extern std::string eval_file_loaded;
extern std::string currentEvalFileName;
// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
// for the build process (profile-build and fishtest) to work. Do not change the
// name of the macro, as it is used in the Makefile.
#define EvalFileDefaultName "nn-3475407dc199.nnue"
#define EvalFileDefaultName "nn-6877cd24400e.nnue"
namespace NNUE {

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -36,6 +36,8 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
typedef WORD(*fun5_t)();
}
#endif
@ -67,7 +69,7 @@ namespace {
/// Version number. If Version is left empty, then the compile date in the format
/// DD-MM-YY is shown in engine_info.
const string Version = "14";
const string Version = "15";
/// Our fancy logging facility. The trick here is to replace cin.rdbuf() and
/// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We
@ -110,7 +112,14 @@ public:
static Logger l;
if (!fname.empty() && !l.file.is_open())
if (l.file.is_open())
{
cout.rdbuf(l.out.buf);
cin.rdbuf(l.in.buf);
l.file.close();
}
if (!fname.empty())
{
l.file.open(fname, ifstream::out);
@ -123,12 +132,6 @@ public:
cin.rdbuf(&l.in);
cout.rdbuf(&l.out);
}
else if (fname.empty() && l.file.is_open())
{
cout.rdbuf(l.out.buf);
cin.rdbuf(l.in.buf);
l.file.close();
}
}
};
@ -378,6 +381,7 @@ void std_aligned_free(void* ptr) {
static void* aligned_large_pages_alloc_windows(size_t allocSize) {
#if !defined(_WIN64)
(void)allocSize; // suppress unused-parameter compiler warning
return nullptr;
#else
@ -493,11 +497,11 @@ void bindThisThread(size_t) {}
#else
/// best_group() retrieves logical processor information using Windows specific
/// API and returns the best group id for the thread with index idx. Original
/// best_node() retrieves logical processor information using Windows specific
/// API and returns the best node id for the thread with index idx. Original
/// code from Texel by Peter Österlund.
int best_group(size_t idx) {
int best_node(size_t idx) {
int threads = 0;
int nodes = 0;
@ -511,7 +515,8 @@ int best_group(size_t idx) {
if (!fun1)
return -1;
// First call to get returnLength. We expect it to fail due to null buffer
// First call to GetLogicalProcessorInformationEx() to get returnLength.
// We expect the call to fail due to null buffer.
if (fun1(RelationAll, nullptr, &returnLength))
return -1;
@ -519,7 +524,7 @@ int best_group(size_t idx) {
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *buffer, *ptr;
ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(returnLength);
// Second call, now we expect to succeed
// Second call to GetLogicalProcessorInformationEx(), now we expect to succeed
if (!fun1(RelationAll, buffer, &returnLength))
{
free(buffer);
@ -569,22 +574,38 @@ int best_group(size_t idx) {
void bindThisThread(size_t idx) {
// Use only local variables to be thread-safe
int group = best_group(idx);
int node = best_node(idx);
if (group == -1)
if (node == -1)
return;
// Early exit if the needed API are not available at runtime
HMODULE k32 = GetModuleHandle("Kernel32.dll");
auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
auto fun5 = (fun5_t)(void(*)())GetProcAddress(k32, "GetMaximumProcessorGroupCount");
if (!fun2 || !fun3)
return;
GROUP_AFFINITY affinity;
if (fun2(group, &affinity))
fun3(GetCurrentThread(), &affinity, nullptr);
if (!fun4 || !fun5)
{
GROUP_AFFINITY affinity;
if (fun2(node, &affinity)) // GetNumaNodeProcessorMaskEx
fun3(GetCurrentThread(), &affinity, nullptr); // SetThreadGroupAffinity
}
else
{
// If a numa node has more than one processor group, we assume they are
// equally sized and we spread threads evenly across the groups.
USHORT elements, returnedElements;
elements = fun5(); // GetMaximumProcessorGroupCount
GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(elements * sizeof(GROUP_AFFINITY));
if (fun4(node, affinity, elements, &returnedElements)) // GetNumaNodeProcessorMask2
fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr); // SetThreadGroupAffinity
free(affinity);
}
}
#endif

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -85,19 +85,30 @@ static inline const union { uint32_t i; char c[4]; } Le = { 0x01020304 };
static inline const bool IsLittleEndian = (Le.c[0] == 4);
template <typename T>
class ValueListInserter {
public:
ValueListInserter(T* v, std::size_t& s) :
values(v),
size(&s)
{
}
// RunningAverage : a class to calculate a running average of a series of values.
// For efficiency, all computations are done with integers.
class RunningAverage {
public:
void push_back(const T& value) { values[(*size)++] = value; }
private:
T* values;
std::size_t* size;
// Reset the running average to rational value p / q
void set(int64_t p, int64_t q)
{ average = p * PERIOD * RESOLUTION / q; }
// Update average with value v
void update(int64_t v)
{ average = RESOLUTION * v + (PERIOD - 1) * average / PERIOD; }
// Test if average is strictly greater than rational a / b
bool is_greater(int64_t a, int64_t b) const
{ return b * average > a * (PERIOD * RESOLUTION); }
int64_t value() const
{ return average / (PERIOD * RESOLUTION); }
private :
static constexpr int64_t PERIOD = 4096;
static constexpr int64_t RESOLUTION = 1024;
int64_t average;
};
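For illustration, a minimal standalone sketch of how this integer running average behaves; the class body below is a simplified copy of the hunk above (with average initialised to zero) and the sample values are invented for the example:

#include <cstdint>
#include <iostream>

class RunningAverage {
public:
    // Reset the running average to the rational value p / q
    void set(int64_t p, int64_t q) { average = p * PERIOD * RESOLUTION / q; }
    // Fold a new observation v into the average
    void update(int64_t v) { average = RESOLUTION * v + (PERIOD - 1) * average / PERIOD; }
    // Current value of the average, truncated to an integer
    int64_t value() const { return average / (PERIOD * RESOLUTION); }
private:
    static constexpr int64_t PERIOD = 4096;
    static constexpr int64_t RESOLUTION = 1024;
    int64_t average = 0;
};

int main() {
    RunningAverage avg;
    avg.set(1, 2);                 // start at 0.5
    for (int i = 0; i < 100000; ++i)
        avg.update(100);           // feed a constant stream of 100s
    std::cout << avg.value() << std::endl;  // prints a value close to 100 (the period is 4096,
                                            // so convergence takes many thousands of updates)
    return 0;
}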
template <typename T, std::size_t MaxSize>
@ -113,7 +124,6 @@ public:
const T& operator[](std::size_t index) const { return values_[index]; }
const T* begin() const { return values_; }
const T* end() const { return values_ + size_; }
operator ValueListInserter<T>() { return ValueListInserter(values_, size_); }
void swap(ValueList& other) {
const std::size_t maxSize = std::max(size_, other.size_);
@ -128,6 +138,34 @@ private:
std::size_t size_ = 0;
};
/// sigmoid(t, x0, y0, C, P, Q) implements a sigmoid-like function using only integers,
/// with the following properties:
///
/// - sigmoid is centered in (x0, y0)
/// - sigmoid has amplitude [-P/Q , P/Q] instead of [-1 , +1]
/// - limit is (y0 - P/Q) when t tends to -infinity
/// - limit is (y0 + P/Q) when t tends to +infinity
/// - the slope can be adjusted using C > 0, smaller C giving a steeper sigmoid
/// - the slope of the sigmoid when t = x0 is P/(Q*C)
/// - sigmoid is increasing with t when P > 0 and Q > 0
/// - to get a decreasing sigmoid, change sign of P
/// - mean value of the sigmoid is y0
///
/// Use <https://www.desmos.com/calculator/jhh83sqq92> to draw the sigmoid
inline int64_t sigmoid(int64_t t, int64_t x0,
int64_t y0,
int64_t C,
int64_t P,
int64_t Q)
{
assert(C > 0);
assert(Q != 0);
return y0 + P * (t-x0) / (Q * (std::abs(t-x0) + C)) ;
}
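As a quick check of the properties listed above, here is a minimal standalone sketch; the sigmoid body is copied from this hunk and the parameter values (x0 = 0, y0 = 0, C = 100, P = 1000, Q = 1) are chosen only for illustration:

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Simplified copy of the integer sigmoid defined above
int64_t sigmoid(int64_t t, int64_t x0, int64_t y0, int64_t C, int64_t P, int64_t Q) {
    return y0 + P * (t - x0) / (Q * (std::abs(t - x0) + C));
}

int main() {
    assert(sigmoid(0,       0, 0, 100, 1000, 1) == 0);    // value at the center is y0
    assert(sigmoid(100,     0, 0, 100, 1000, 1) == 500);  // one C away from x0: half the amplitude P/Q
    assert(sigmoid(1000000, 0, 0, 100, 1000, 1) == 999);  // far from x0: approaches the limit y0 + P/Q = 1000
    return 0;
}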
/// xorshift64star Pseudo-Random Number Generator
/// This class is based on original code written and dedicated
/// to the public domain by Sebastiano Vigna (2014).

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -52,9 +52,9 @@ namespace {
constexpr Direction UpRight = (Us == WHITE ? NORTH_EAST : SOUTH_WEST);
constexpr Direction UpLeft = (Us == WHITE ? NORTH_WEST : SOUTH_EAST);
const Bitboard emptySquares = Type == QUIETS || Type == QUIET_CHECKS ? target : ~pos.pieces();
const Bitboard enemies = Type == EVASIONS ? pos.checkers()
: Type == CAPTURES ? target : pos.pieces(Them);
const Bitboard emptySquares = ~pos.pieces();
const Bitboard enemies = Type == EVASIONS ? pos.checkers()
: pos.pieces(Them);
Bitboard pawnsOn7 = pos.pieces(Us, PAWN) & TRank7BB;
Bitboard pawnsNotOn7 = pos.pieces(Us, PAWN) & ~TRank7BB;

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -18,6 +18,7 @@
#include <cassert>
#include "bitboard.h"
#include "movepick.h"
namespace Stockfish {
@ -56,11 +57,14 @@ namespace {
/// ordering is at the current node.
/// MovePicker constructor for the main search
MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHistory* mh, const LowPlyHistory* lp,
const CapturePieceToHistory* cph, const PieceToHistory** ch, Move cm, const Move* killers, int pl)
: pos(p), mainHistory(mh), lowPlyHistory(lp), captureHistory(cph), continuationHistory(ch),
ttMove(ttm), refutations{{killers[0], 0}, {killers[1], 0}, {cm, 0}}, depth(d), ply(pl) {
MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHistory* mh,
const CapturePieceToHistory* cph,
const PieceToHistory** ch,
Move cm,
const Move* killers)
: pos(p), mainHistory(mh), captureHistory(cph), continuationHistory(ch),
ttMove(ttm), refutations{{killers[0], 0}, {killers[1], 0}, {cm, 0}}, depth(d)
{
assert(d > 0);
stage = (pos.checkers() ? EVASION_TT : MAIN_TT) +
@ -69,9 +73,11 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist
/// MovePicker constructor for quiescence search
MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHistory* mh,
const CapturePieceToHistory* cph, const PieceToHistory** ch, Square rs)
: pos(p), mainHistory(mh), captureHistory(cph), continuationHistory(ch), ttMove(ttm), recaptureSquare(rs), depth(d) {
const CapturePieceToHistory* cph,
const PieceToHistory** ch,
Square rs)
: pos(p), mainHistory(mh), captureHistory(cph), continuationHistory(ch), ttMove(ttm), recaptureSquare(rs), depth(d)
{
assert(d <= 0);
stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) +
@ -82,9 +88,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist
/// MovePicker constructor for ProbCut: we generate captures with SEE greater
/// than or equal to the given threshold.
MovePicker::MovePicker(const Position& p, Move ttm, Value th, const CapturePieceToHistory* cph)
: pos(p), captureHistory(cph), ttMove(ttm), threshold(th) {
MovePicker::MovePicker(const Position& p, Move ttm, Value th, Depth d, const CapturePieceToHistory* cph)
: pos(p), captureHistory(cph), ttMove(ttm), threshold(th), depth(d)
{
assert(!pos.checkers());
stage = PROBCUT_TT + !(ttm && pos.capture(ttm)
@ -100,10 +106,35 @@ void MovePicker::score() {
static_assert(Type == CAPTURES || Type == QUIETS || Type == EVASIONS, "Wrong type");
Bitboard threatened, threatenedByPawn, threatenedByMinor, threatenedByRook;
if constexpr (Type == QUIETS)
{
Color us = pos.side_to_move();
// squares threatened by pawns
threatenedByPawn = pos.attacks_by<PAWN>(~us);
// squares threatened by minors or pawns
threatenedByMinor = pos.attacks_by<KNIGHT>(~us) | pos.attacks_by<BISHOP>(~us) | threatenedByPawn;
// squares threatened by rooks, minors or pawns
threatenedByRook = pos.attacks_by<ROOK>(~us) | threatenedByMinor;
// pieces threatened by pieces of lesser material value
threatened = (pos.pieces(us, QUEEN) & threatenedByRook)
| (pos.pieces(us, ROOK) & threatenedByMinor)
| (pos.pieces(us, KNIGHT, BISHOP) & threatenedByPawn);
}
else
{
// Silence unused variable warnings
(void) threatened;
(void) threatenedByPawn;
(void) threatenedByMinor;
(void) threatenedByRook;
}
for (auto& m : *this)
if constexpr (Type == CAPTURES)
m.value = int(PieceValue[MG][pos.piece_on(to_sq(m))]) * 6
+ (*captureHistory)[pos.moved_piece(m)][to_sq(m)][type_of(pos.piece_on(to_sq(m)))];
m.value = 6 * int(PieceValue[MG][pos.piece_on(to_sq(m))])
+ (*captureHistory)[pos.moved_piece(m)][to_sq(m)][type_of(pos.piece_on(to_sq(m)))];
else if constexpr (Type == QUIETS)
m.value = (*mainHistory)[pos.side_to_move()][from_to(m)]
@ -111,7 +142,12 @@ void MovePicker::score() {
+ (*continuationHistory[1])[pos.moved_piece(m)][to_sq(m)]
+ (*continuationHistory[3])[pos.moved_piece(m)][to_sq(m)]
+ (*continuationHistory[5])[pos.moved_piece(m)][to_sq(m)]
+ (ply < MAX_LPH ? std::min(4, depth / 3) * (*lowPlyHistory)[ply][from_to(m)] : 0);
+ (threatened & from_sq(m) ?
(type_of(pos.moved_piece(m)) == QUEEN && !(to_sq(m) & threatenedByRook) ? 50000
: type_of(pos.moved_piece(m)) == ROOK && !(to_sq(m) & threatenedByMinor) ? 25000
: !(to_sq(m) & threatenedByPawn) ? 15000
: 0)
: 0);
else // Type == EVASIONS
{
@ -165,11 +201,12 @@ top:
endMoves = generate<CAPTURES>(pos, cur);
score<CAPTURES>();
partial_insertion_sort(cur, endMoves, -3000 * depth);
++stage;
goto top;
case GOOD_CAPTURE:
if (select<Best>([&](){
if (select<Next>([&](){
return pos.see_ge(*cur, Value(-69 * cur->value / 1024)) ?
// Move losing capture to endBadCaptures to be tried later
true : (*endBadCaptures++ = *cur, false); }))
@ -237,10 +274,10 @@ top:
return select<Best>([](){ return true; });
case PROBCUT:
return select<Best>([&](){ return pos.see_ge(*cur, threshold); });
return select<Next>([&](){ return pos.see_ge(*cur, threshold); });
case QCAPTURE:
if (select<Best>([&](){ return depth > DEPTH_QS_RECAPTURES
if (select<Next>([&](){ return depth > DEPTH_QS_RECAPTURES
|| to_sq(*cur) == recaptureSquare; }))
return *(cur - 1);

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -86,13 +86,7 @@ enum StatsType { NoCaptures, Captures };
/// unsuccessful during the current search, and is used for reduction and move
/// ordering decisions. It uses 2 tables (one for each color) indexed by
/// the move's from and to squares, see www.chessprogramming.org/Butterfly_Boards
typedef Stats<int16_t, 13365, COLOR_NB, int(SQUARE_NB) * int(SQUARE_NB)> ButterflyHistory;
/// At higher depths LowPlyHistory records successful quiet moves near the root
/// and quiet moves which are/were in the PV (ttPv). It is cleared with each new
/// search and filled during iterative deepening.
constexpr int MAX_LPH = 4;
typedef Stats<int16_t, 10692, MAX_LPH, int(SQUARE_NB) * int(SQUARE_NB)> LowPlyHistory;
typedef Stats<int16_t, 14365, COLOR_NB, int(SQUARE_NB) * int(SQUARE_NB)> ButterflyHistory;
/// CounterMoveHistory stores counter moves indexed by [piece][to] of the previous
/// move, see www.chessprogramming.org/Countermove_Heuristic
@ -123,18 +117,16 @@ class MovePicker {
public:
MovePicker(const MovePicker&) = delete;
MovePicker& operator=(const MovePicker&) = delete;
MovePicker(const Position&, Move, Value, const CapturePieceToHistory*);
MovePicker(const Position&, Move, Depth, const ButterflyHistory*,
const CapturePieceToHistory*,
const PieceToHistory**,
Move,
const Move*);
MovePicker(const Position&, Move, Depth, const ButterflyHistory*,
const CapturePieceToHistory*,
const PieceToHistory**,
Square);
MovePicker(const Position&, Move, Depth, const ButterflyHistory*,
const LowPlyHistory*,
const CapturePieceToHistory*,
const PieceToHistory**,
Move,
const Move*,
int);
MovePicker(const Position&, Move, Value, Depth, const CapturePieceToHistory*);
Move next_move(bool skipQuiets = false);
private:
@ -145,7 +137,6 @@ private:
const Position& pos;
const ButterflyHistory* mainHistory;
const LowPlyHistory* lowPlyHistory;
const CapturePieceToHistory* captureHistory;
const PieceToHistory** continuationHistory;
Move ttMove;
@ -154,7 +145,6 @@ private:
Square recaptureSquare;
Value threshold;
Depth depth;
int ply;
ExtMove moves[MAX_MOVES];
};

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -109,7 +109,7 @@ namespace Stockfish::Eval::NNUE {
{
write_little_endian<std::uint32_t>(stream, Version);
write_little_endian<std::uint32_t>(stream, hashValue);
write_little_endian<std::uint32_t>(stream, desc.size());
write_little_endian<std::uint32_t>(stream, (std::uint32_t)desc.size());
stream.write(&desc[0], desc.size());
return !stream.fail();
}
@ -143,39 +143,29 @@ namespace Stockfish::Eval::NNUE {
// overaligning stack variables with alignas() doesn't work correctly.
constexpr uint64_t alignment = CacheLineSize;
int delta = 10 - pos.non_pawn_material() / 1515;
#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
TransformedFeatureType transformedFeaturesUnaligned[
FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)];
char bufferUnaligned[Network::BufferSize + alignment];
auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
auto* buffer = align_ptr_up<alignment>(&bufferUnaligned[0]);
#else
alignas(alignment)
TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize];
alignas(alignment) char buffer[Network::BufferSize];
#endif
ASSERT_ALIGNED(transformedFeatures, alignment);
ASSERT_ALIGNED(buffer, alignment);
const std::size_t bucket = (pos.count<ALL_PIECES>() - 1) / 4;
const int bucket = (pos.count<ALL_PIECES>() - 1) / 4;
const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket);
const auto output = network[bucket]->propagate(transformedFeatures, buffer);
const auto positional = network[bucket]->propagate(transformedFeatures);
int materialist = psqt;
int positional = output[0];
int delta_npm = abs(pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK));
int entertainment = (adjusted && delta_npm <= BishopValueMg - KnightValueMg ? 7 : 0);
int A = 128 - entertainment;
int B = 128 + entertainment;
int sum = (A * materialist + B * positional) / 128;
return static_cast<Value>( sum / OutputScale );
// Give more value to positional evaluation when adjusted flag is set
if (adjusted)
return static_cast<Value>(((128 - delta) * psqt + (128 + delta) * positional) / 128 / OutputScale);
else
return static_cast<Value>((psqt + positional) / OutputScale);
}
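As a worked example of the blend above: delta = 10 - pos.non_pawn_material() / 1515, so in a position with little non-pawn material left delta is close to 10 and the adjusted return value is roughly (118 * psqt + 138 * positional) / 128 / OutputScale, while with most material still on the board delta shrinks towards 0 and the expression approaches the plain (psqt + positional) / OutputScale sum.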
struct NnueEvalTrace {
@ -196,27 +186,20 @@ namespace Stockfish::Eval::NNUE {
#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
TransformedFeatureType transformedFeaturesUnaligned[
FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)];
char bufferUnaligned[Network::BufferSize + alignment];
auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
auto* buffer = align_ptr_up<alignment>(&bufferUnaligned[0]);
#else
alignas(alignment)
TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize];
alignas(alignment) char buffer[Network::BufferSize];
#endif
ASSERT_ALIGNED(transformedFeatures, alignment);
ASSERT_ALIGNED(buffer, alignment);
NnueEvalTrace t{};
t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) {
const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket);
const auto output = network[bucket]->propagate(transformedFeatures, buffer);
int materialist = psqt;
int positional = output[0];
for (IndexType bucket = 0; bucket < LayerStacks; ++bucket) {
const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket);
const auto positional = network[bucket]->propagate(transformedFeatures);
t.psqt[bucket] = static_cast<Value>( materialist / OutputScale );
t.positional[bucket] = static_cast<Value>( positional / OutputScale );
@ -227,69 +210,46 @@ namespace Stockfish::Eval::NNUE {
static const std::string PieceToChar(" PNBRQK pnbrqk");
// Requires the buffer to have capacity for at least 5 values
// format_cp_compact() converts a Value into (centi)pawns and writes it in a buffer.
// The buffer must have capacity for at least 5 chars.
static void format_cp_compact(Value v, char* buffer) {
buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' ');
int cp = std::abs(100 * v / PawnValueEg);
if (cp >= 10000)
{
buffer[1] = '0' + cp / 10000; cp %= 10000;
buffer[2] = '0' + cp / 1000; cp %= 1000;
buffer[3] = '0' + cp / 100; cp %= 100;
buffer[4] = ' ';
buffer[1] = '0' + cp / 10000; cp %= 10000;
buffer[2] = '0' + cp / 1000; cp %= 1000;
buffer[3] = '0' + cp / 100;
buffer[4] = ' ';
}
else if (cp >= 1000)
{
buffer[1] = '0' + cp / 1000; cp %= 1000;
buffer[2] = '0' + cp / 100; cp %= 100;
buffer[3] = '.';
buffer[4] = '0' + cp / 10;
buffer[1] = '0' + cp / 1000; cp %= 1000;
buffer[2] = '0' + cp / 100; cp %= 100;
buffer[3] = '.';
buffer[4] = '0' + cp / 10;
}
else
{
buffer[1] = '0' + cp / 100; cp %= 100;
buffer[2] = '.';
buffer[3] = '0' + cp / 10; cp %= 10;
buffer[4] = '0' + cp / 1;
buffer[1] = '0' + cp / 100; cp %= 100;
buffer[2] = '.';
buffer[3] = '0' + cp / 10; cp %= 10;
buffer[4] = '0' + cp / 1;
}
}
// Requires the buffer to have capacity for at least 7 values
// format_cp_aligned_dot() converts a Value into (centi)pawns and writes it in a buffer,
// always keeping two decimals. The buffer must have capacity for at least 7 chars.
static void format_cp_aligned_dot(Value v, char* buffer) {
buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' ');
int cp = std::abs(100 * v / PawnValueEg);
if (cp >= 10000)
{
buffer[1] = '0' + cp / 10000; cp %= 10000;
buffer[2] = '0' + cp / 1000; cp %= 1000;
buffer[3] = '0' + cp / 100; cp %= 100;
buffer[4] = '.';
buffer[5] = '0' + cp / 10; cp %= 10;
buffer[6] = '0' + cp;
}
else if (cp >= 1000)
{
buffer[1] = ' ';
buffer[2] = '0' + cp / 1000; cp %= 1000;
buffer[3] = '0' + cp / 100; cp %= 100;
buffer[4] = '.';
buffer[5] = '0' + cp / 10; cp %= 10;
buffer[6] = '0' + cp;
}
else
{
buffer[1] = ' ';
buffer[2] = ' ';
buffer[3] = '0' + cp / 100; cp %= 100;
buffer[4] = '.';
buffer[5] = '0' + cp / 10; cp %= 10;
buffer[6] = '0' + cp / 1;
}
double cp = 1.0 * std::abs(int(v)) / PawnValueEg;
sprintf(&buffer[1], "%6.2f", cp);
}
@ -419,7 +379,7 @@ namespace Stockfish::Eval::NNUE {
actualFilename = filename.value();
else
{
if (eval_file_loaded != EvalFileDefaultName)
if (currentEvalFileName != EvalFileDefaultName)
{
msg = "Failed to export a net. A non-embedded net can only be saved if the filename is specified";

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -16,31 +16,32 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
//Definition of input features HalfKAv2 of NNUE evaluation function
//Definition of input features HalfKAv2_hm of NNUE evaluation function
#include "half_ka_v2.h"
#include "half_ka_v2_hm.h"
#include "../../position.h"
namespace Stockfish::Eval::NNUE::Features {
// Orient a square according to perspective (rotates by 180 for black)
inline Square HalfKAv2::orient(Color perspective, Square s) {
return Square(int(s) ^ (bool(perspective) * 56));
inline Square HalfKAv2_hm::orient(Color perspective, Square s, Square ksq) {
return Square(int(s) ^ (bool(perspective) * SQ_A8) ^ ((file_of(ksq) < FILE_E) * SQ_H1));
}
// Index of a feature for a given king position and another piece on some square
inline IndexType HalfKAv2::make_index(Color perspective, Square s, Piece pc, Square ksq) {
return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq);
inline IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) {
Square o_ksq = orient(perspective, ksq, ksq);
return IndexType(orient(perspective, s, ksq) + PieceSquareIndex[perspective][pc] + PS_NB * KingBuckets[o_ksq]);
}
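A small standalone illustration of the orientation arithmetic above, using the usual 0..63 square numbering with A1 = 0, H1 = 7 and A8 = 56 (the same constants as SQ_H1 and SQ_A8); this is only a sketch of the flipping trick, not the engine code:

#include <cassert>

// File 0..7 of a 0..63 square (A1 = 0, H1 = 7, A8 = 56)
constexpr int file_of(int s) { return s & 7; }

// Flip vertically for black (xor 56) and mirror horizontally when the king
// is on files a..d (xor 7), so the oriented king always lands on files e..h.
constexpr int orient(bool black, int s, int ksq) {
    return s ^ (black ? 56 : 0) ^ (file_of(ksq) < 4 ? 7 : 0);
}

int main() {
    // White king on c1 (square 2): files a..d, so the board is mirrored and c1 maps to f1 (5)
    assert(orient(false, 2, 2) == 5);
    // Black king on g8 (square 62): flipped vertically to g1 (6), no horizontal mirroring
    assert(orient(true, 62, 62) == 6);
    // A white pawn on a2 (8), seen from the white king on c1, is mirrored to h2 (15)
    assert(orient(false, 8, 2) == 15);
    return 0;
}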
// Get a list of indices for active features
void HalfKAv2::append_active_indices(
void HalfKAv2_hm::append_active_indices(
const Position& pos,
Color perspective,
ValueListInserter<IndexType> active
IndexList& active
) {
Square ksq = orient(perspective, pos.square<KING>(perspective));
Square ksq = pos.square<KING>(perspective);
Bitboard bb = pos.pieces();
while (bb)
{
@ -52,33 +53,30 @@ namespace Stockfish::Eval::NNUE::Features {
// append_changed_indices() : get a list of indices for recently changed features
void HalfKAv2::append_changed_indices(
void HalfKAv2_hm::append_changed_indices(
Square ksq,
StateInfo* st,
const DirtyPiece& dp,
Color perspective,
ValueListInserter<IndexType> removed,
ValueListInserter<IndexType> added
IndexList& removed,
IndexList& added
) {
const auto& dp = st->dirtyPiece;
Square oriented_ksq = orient(perspective, ksq);
for (int i = 0; i < dp.dirty_num; ++i) {
Piece pc = dp.piece[i];
if (dp.from[i] != SQ_NONE)
removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq));
removed.push_back(make_index(perspective, dp.from[i], dp.piece[i], ksq));
if (dp.to[i] != SQ_NONE)
added.push_back(make_index(perspective, dp.to[i], pc, oriented_ksq));
added.push_back(make_index(perspective, dp.to[i], dp.piece[i], ksq));
}
}
int HalfKAv2::update_cost(StateInfo* st) {
int HalfKAv2_hm::update_cost(const StateInfo* st) {
return st->dirtyPiece.dirty_num;
}
int HalfKAv2::refresh_cost(const Position& pos) {
int HalfKAv2_hm::refresh_cost(const Position& pos) {
return pos.count<ALL_PIECES>();
}
bool HalfKAv2::requires_refresh(StateInfo* st, Color perspective) {
bool HalfKAv2_hm::requires_refresh(const StateInfo* st, Color perspective) {
return st->dirtyPiece.piece[0] == make_piece(perspective, KING);
}

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -18,8 +18,8 @@
//Definition of input features HalfKP of NNUE evaluation function
#ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
#define NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
#include "../nnue_common.h"
@ -32,9 +32,9 @@ namespace Stockfish {
namespace Stockfish::Eval::NNUE::Features {
// Feature HalfKAv2: Combination of the position of own king
// and the position of pieces
class HalfKAv2 {
// Feature HalfKAv2_hm: Combination of the position of own king
// and the position of pieces. The position is mirrored so that the king is always on files e..h.
class HalfKAv2_hm {
// unique number for each piece type on each square
enum {
@ -50,7 +50,7 @@ namespace Stockfish::Eval::NNUE::Features {
PS_W_QUEEN = 8 * SQUARE_NB,
PS_B_QUEEN = 9 * SQUARE_NB,
PS_KING = 10 * SQUARE_NB,
PS_NB = 11 * SQUARE_NB
PS_NB = 11 * SQUARE_NB
};
static constexpr IndexType PieceSquareIndex[COLOR_NB][PIECE_NB] = {
@ -63,49 +63,62 @@ namespace Stockfish::Eval::NNUE::Features {
};
// Orient a square according to perspective (rotates by 180 for black)
static Square orient(Color perspective, Square s);
static Square orient(Color perspective, Square s, Square ksq);
// Index of a feature for a given king position and another piece on some square
static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq);
public:
// Feature name
static constexpr const char* Name = "HalfKAv2(Friend)";
static constexpr const char* Name = "HalfKAv2_hm(Friend)";
// Hash value embedded in the evaluation file
static constexpr std::uint32_t HashValue = 0x5f234cb8u;
static constexpr std::uint32_t HashValue = 0x7f234cb8u;
// Number of feature dimensions
static constexpr IndexType Dimensions =
static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB);
static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB) / 2;
static constexpr int KingBuckets[64] = {
-1, -1, -1, -1, 31, 30, 29, 28,
-1, -1, -1, -1, 27, 26, 25, 24,
-1, -1, -1, -1, 23, 22, 21, 20,
-1, -1, -1, -1, 19, 18, 17, 16,
-1, -1, -1, -1, 15, 14, 13, 12,
-1, -1, -1, -1, 11, 10, 9, 8,
-1, -1, -1, -1, 7, 6, 5, 4,
-1, -1, -1, -1, 3, 2, 1, 0
};
// Maximum number of simultaneously active features.
static constexpr IndexType MaxActiveDimensions = 32;
using IndexList = ValueList<IndexType, MaxActiveDimensions>;
// Get a list of indices for active features
static void append_active_indices(
const Position& pos,
Color perspective,
ValueListInserter<IndexType> active);
IndexList& active);
// Get a list of indices for recently changed features
static void append_changed_indices(
Square ksq,
StateInfo* st,
const DirtyPiece& dp,
Color perspective,
ValueListInserter<IndexType> removed,
ValueListInserter<IndexType> added);
IndexList& removed,
IndexList& added
);
// Returns the cost of updating one perspective, the most costly one.
// Assumes no refresh needed.
static int update_cost(StateInfo* st);
static int update_cost(const StateInfo* st);
static int refresh_cost(const Position& pos);
// Returns whether the change stored in this StateInfo means that
// a full accumulator refresh is required.
static bool requires_refresh(StateInfo* st, Color perspective);
static bool requires_refresh(const StateInfo* st, Color perspective);
};
} // namespace Stockfish::Eval::NNUE::Features
#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -22,398 +22,338 @@
#define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
#include <iostream>
#include <algorithm>
#include <type_traits>
#include "../nnue_common.h"
#include "../../simd.h"
/*
This file contains the definition for a fully connected layer (aka affine transform).
Two approaches are employed, depending on the sizes of the transform.
Approach 1:
- used when the PaddedInputDimensions >= 128
- uses AVX512 if possible
- processes inputs in batches of 2*InputSimdWidth
- so in batches of 128 for AVX512
- the weight blocks of size InputSimdWidth are transposed such that
access is sequential
- N columns of the weight matrix are processed a time, where N
depends on the architecture (the amount of registers)
- accumulate + hadd is used
Approach 2:
- used when the PaddedInputDimensions < 128
- does not use AVX512
- expected use-case is small layers, typically PaddedInputDimensions == 32
  and InputDimensions <= 32 (which is why AVX512 is hard to use effectively)
- not as well optimized as approach 1
- inputs are processed in chunks of 4, weights are respectively transposed
- accumulation happens directly to int32s
*/
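For reference, both approaches compute the same result as the naive scalar loop below (a standalone sketch with made-up dimensions and values, not the engine's optimized code); the SIMD specializations only reorder and vectorize this work:

#include <cstdint>
#include <cstdio>

// Naive affine transform: output = biases + weights * input, with int8 weights,
// uint8 inputs, int32 accumulation and row-major weights[outDims][paddedInDims].
void affine_reference(int32_t* output, const int8_t* weights, const int32_t* biases,
                      const uint8_t* input, int outDims, int paddedInDims) {
    for (int i = 0; i < outDims; ++i) {
        int32_t sum = biases[i];
        for (int j = 0; j < paddedInDims; ++j)
            sum += weights[i * paddedInDims + j] * input[j];
        output[i] = sum;
    }
}

int main() {
    // Tiny made-up layer: 2 outputs, 4 (padded) inputs
    const int8_t  weights[2 * 4] = { 1, 2, 3, 4,   -1, 0, 1, 0 };
    const int32_t biases[2]      = { 10, -10 };
    const uint8_t input[4]       = { 1, 1, 2, 0 };
    int32_t output[2];
    affine_reference(output, weights, biases, input, 2, 4);
    std::printf("%d %d\n", output[0], output[1]);   // prints "19 -9"
    return 0;
}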
namespace Stockfish::Eval::NNUE::Layers {
// Affine transformation layer
template <typename PreviousLayer, IndexType OutDims>
class AffineTransform {
public:
// Input/output type
using InputType = typename PreviousLayer::OutputType;
using OutputType = std::int32_t;
static_assert(std::is_same<InputType, std::uint8_t>::value, "");
// Fallback implementation for older/other architectures.
// Identical for both approaches. Requires the input to be padded to at least 16 values.
#if !defined(USE_SSSE3)
template <IndexType InputDimensions, IndexType PaddedInputDimensions, IndexType OutputDimensions>
static void affine_transform_non_ssse3(std::int32_t* output, const std::int8_t* weights, const std::int32_t* biases, const std::uint8_t* input)
{
# if defined(USE_SSE2)
// At least a multiple of 16, with SSE2.
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const __m128i Zeros = _mm_setzero_si128();
const auto inputVector = reinterpret_cast<const __m128i*>(input);
// Number of input/output dimensions
static constexpr IndexType InputDimensions =
PreviousLayer::OutputDimensions;
static constexpr IndexType OutputDimensions = OutDims;
static constexpr IndexType PaddedInputDimensions =
ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
#if defined (USE_AVX512)
static constexpr const IndexType OutputSimdWidth = SimdWidth / 2;
#elif defined (USE_SSSE3)
static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
# elif defined(USE_MMX)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 8;
const __m64 Zeros = _mm_setzero_si64();
const auto inputVector = reinterpret_cast<const __m64*>(input);
# elif defined(USE_NEON)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
# endif
for (IndexType i = 0; i < OutputDimensions; ++i) {
const IndexType offset = i * PaddedInputDimensions;
# if defined(USE_SSE2)
__m128i sumLo = _mm_cvtsi32_si128(biases[i]);
__m128i sumHi = Zeros;
const auto row = reinterpret_cast<const __m128i*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
__m128i row_j = _mm_load_si128(&row[j]);
__m128i input_j = _mm_load_si128(&inputVector[j]);
__m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8);
__m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8);
__m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros);
__m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros);
__m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo);
__m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi);
sumLo = _mm_add_epi32(sumLo, productLo);
sumHi = _mm_add_epi32(sumHi, productHi);
}
__m128i sum = _mm_add_epi32(sumLo, sumHi);
__m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(sum, sumHigh_64);
__m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(sum, sum_second_32);
output[i] = _mm_cvtsi128_si32(sum);
# elif defined(USE_MMX)
__m64 sumLo = _mm_cvtsi32_si64(biases[i]);
__m64 sumHi = Zeros;
const auto row = reinterpret_cast<const __m64*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
__m64 row_j = row[j];
__m64 input_j = inputVector[j];
__m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8);
__m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8);
__m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros);
__m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros);
__m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo);
__m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi);
sumLo = _mm_add_pi32(sumLo, productLo);
sumHi = _mm_add_pi32(sumHi, productHi);
}
__m64 sum = _mm_add_pi32(sumLo, sumHi);
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
output[i] = _mm_cvtsi64_si32(sum);
# elif defined(USE_NEON)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]);
product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]);
sum = vpadalq_s16(sum, product);
}
output[i] = sum[0] + sum[1] + sum[2] + sum[3];
# else
std::int32_t sum = biases[i];
for (IndexType j = 0; j < InputDimensions; ++j) {
sum += weights[offset + j] * input[j];
}
output[i] = sum;
# endif
}
# if defined(USE_MMX)
_mm_empty();
# endif
}
#endif
// Size of forward propagation buffer used in this layer
static constexpr std::size_t SelfBufferSize =
ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
template <IndexType InDims, IndexType OutDims, typename Enabled = void>
class AffineTransform;
// Size of the forward propagation buffer used from the input layer to this layer
static constexpr std::size_t BufferSize =
PreviousLayer::BufferSize + SelfBufferSize;
// A specialization for large inputs.
template <IndexType InDims, IndexType OutDims>
class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) >= 2*64)>> {
public:
// Input/output type
using InputType = std::uint8_t;
using OutputType = std::int32_t;
// Number of input/output dimensions
static constexpr IndexType InputDimensions = InDims;
static constexpr IndexType OutputDimensions = OutDims;
static constexpr IndexType PaddedInputDimensions =
ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
static constexpr IndexType PaddedOutputDimensions =
ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
using OutputBuffer = OutputType[PaddedOutputDimensions];
static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen.");
#if defined (USE_AVX512)
static constexpr const IndexType InputSimdWidth = 64;
static constexpr const IndexType MaxNumOutputRegs = 16;
#elif defined (USE_AVX2)
static constexpr const IndexType InputSimdWidth = 32;
static constexpr const IndexType MaxNumOutputRegs = 8;
#elif defined (USE_SSSE3)
static constexpr const IndexType InputSimdWidth = 16;
static constexpr const IndexType MaxNumOutputRegs = 8;
#elif defined (USE_NEON)
static constexpr const IndexType InputSimdWidth = 8;
static constexpr const IndexType MaxNumOutputRegs = 8;
#else
// The fallback implementation will not have permuted weights.
// We define these to avoid a lot of ifdefs later.
static constexpr const IndexType InputSimdWidth = 1;
static constexpr const IndexType MaxNumOutputRegs = 1;
#endif
// A big block is a region in the weight matrix of the size [PaddedInputDimensions, NumOutputRegs].
// A small block is a region of size [InputSimdWidth, 1]
static constexpr const IndexType NumOutputRegs = std::min(MaxNumOutputRegs, OutputDimensions);
static constexpr const IndexType SmallBlockSize = InputSimdWidth;
static constexpr const IndexType BigBlockSize = NumOutputRegs * PaddedInputDimensions;
static constexpr const IndexType NumSmallBlocksInBigBlock = BigBlockSize / SmallBlockSize;
static constexpr const IndexType NumSmallBlocksPerOutput = PaddedInputDimensions / SmallBlockSize;
static constexpr const IndexType NumBigBlocks = OutputDimensions / NumOutputRegs;
static_assert(OutputDimensions % NumOutputRegs == 0);
// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value() {
static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
std::uint32_t hashValue = 0xCC03DAE4u;
hashValue += OutputDimensions;
hashValue ^= PreviousLayer::get_hash_value() >> 1;
hashValue ^= PreviousLayer::get_hash_value() << 31;
hashValue ^= prevHash >> 1;
hashValue ^= prevHash << 31;
return hashValue;
}
/*
Transposes the small blocks within a block.
Effectively means that weights can be traversed sequentially during inference.
*/
static IndexType get_weight_index(IndexType i)
{
const IndexType smallBlock = (i / SmallBlockSize) % NumSmallBlocksInBigBlock;
const IndexType smallBlockCol = smallBlock / NumSmallBlocksPerOutput;
const IndexType smallBlockRow = smallBlock % NumSmallBlocksPerOutput;
const IndexType bigBlock = i / BigBlockSize;
const IndexType rest = i % SmallBlockSize;
const IndexType idx =
bigBlock * BigBlockSize
+ smallBlockRow * SmallBlockSize * NumOutputRegs
+ smallBlockCol * SmallBlockSize
+ rest;
return idx;
}
// Read network parameters
bool read_parameters(std::istream& stream) {
if (!previousLayer.read_parameters(stream)) return false;
for (std::size_t i = 0; i < OutputDimensions; ++i)
for (IndexType i = 0; i < OutputDimensions; ++i)
biases[i] = read_little_endian<BiasType>(stream);
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
#if !defined (USE_SSSE3)
weights[i] = read_little_endian<WeightType>(stream);
#else
weights[
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
i / PaddedInputDimensions * 4 +
i % 4
] = read_little_endian<WeightType>(stream);
#endif
for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
return !stream.fail();
}
// Write network parameters
bool write_parameters(std::ostream& stream) const {
if (!previousLayer.write_parameters(stream)) return false;
for (std::size_t i = 0; i < OutputDimensions; ++i)
for (IndexType i = 0; i < OutputDimensions; ++i)
write_little_endian<BiasType>(stream, biases[i]);
#if !defined (USE_SSSE3)
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, weights[i]);
#else
std::unique_ptr<WeightType[]> unscrambledWeights = std::make_unique<WeightType[]>(OutputDimensions * PaddedInputDimensions);
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) {
unscrambledWeights[i] =
weights[
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
i / PaddedInputDimensions * 4 +
i % 4
];
}
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, unscrambledWeights[i]);
#endif
for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
return !stream.fail();
}
// Forward propagation
const OutputType* propagate(
const TransformedFeatureType* transformedFeatures, char* buffer) const {
const auto input = previousLayer.propagate(
transformedFeatures, buffer + SelfBufferSize);
const InputType* input, OutputType* output) const {
#if defined (USE_AVX512)
[[maybe_unused]] const __m512i Ones512 = _mm512_set1_epi16(1);
[[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int {
return _mm512_reduce_add_epi32(sum) + bias;
};
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
#if defined (USE_VNNI)
acc = _mm512_dpbusd_epi32(acc, a, b);
#else
__m512i product0 = _mm512_maddubs_epi16(a, b);
product0 = _mm512_madd_epi16(product0, Ones512);
acc = _mm512_add_epi32(acc, product0);
#endif
};
[[maybe_unused]] auto m512_add_dpbusd_epi32x4 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1,
__m512i a2, __m512i b2, __m512i a3, __m512i b3) {
#if defined (USE_VNNI)
acc = _mm512_dpbusd_epi32(acc, a0, b0);
acc = _mm512_dpbusd_epi32(acc, a1, b1);
acc = _mm512_dpbusd_epi32(acc, a2, b2);
acc = _mm512_dpbusd_epi32(acc, a3, b3);
#else
__m512i product0 = _mm512_maddubs_epi16(a0, b0);
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
__m512i product2 = _mm512_maddubs_epi16(a2, b2);
__m512i product3 = _mm512_maddubs_epi16(a3, b3);
product0 = _mm512_adds_epi16(product0, product1);
product0 = _mm512_madd_epi16(product0, Ones512);
product2 = _mm512_adds_epi16(product2, product3);
product2 = _mm512_madd_epi16(product2, Ones512);
acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product2));
#endif
};
#endif
#if defined (USE_AVX2)
[[maybe_unused]] const __m256i Ones256 = _mm256_set1_epi16(1);
[[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int {
__m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
return _mm_cvtsi128_si32(sum128) + bias;
};
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
#if defined (USE_VNNI)
acc = _mm256_dpbusd_epi32(acc, a, b);
#else
__m256i product0 = _mm256_maddubs_epi16(a, b);
product0 = _mm256_madd_epi16(product0, Ones256);
acc = _mm256_add_epi32(acc, product0);
#endif
};
[[maybe_unused]] auto m256_add_dpbusd_epi32x4 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1,
__m256i a2, __m256i b2, __m256i a3, __m256i b3) {
#if defined (USE_VNNI)
acc = _mm256_dpbusd_epi32(acc, a0, b0);
acc = _mm256_dpbusd_epi32(acc, a1, b1);
acc = _mm256_dpbusd_epi32(acc, a2, b2);
acc = _mm256_dpbusd_epi32(acc, a3, b3);
#else
__m256i product0 = _mm256_maddubs_epi16(a0, b0);
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
__m256i product2 = _mm256_maddubs_epi16(a2, b2);
__m256i product3 = _mm256_maddubs_epi16(a3, b3);
product0 = _mm256_adds_epi16(product0, product1);
product0 = _mm256_madd_epi16(product0, Ones256);
product2 = _mm256_adds_epi16(product2, product3);
product2 = _mm256_madd_epi16(product2, Ones256);
acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product2));
#endif
};
#endif
#if defined (USE_SSSE3)
[[maybe_unused]] const __m128i Ones128 = _mm_set1_epi16(1);
[[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int {
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
return _mm_cvtsi128_si32(sum) + bias;
};
[[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
__m128i product0 = _mm_maddubs_epi16(a, b);
product0 = _mm_madd_epi16(product0, Ones128);
acc = _mm_add_epi32(acc, product0);
};
[[maybe_unused]] auto m128_add_dpbusd_epi32x4 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1,
__m128i a2, __m128i b2, __m128i a3, __m128i b3) {
__m128i product0 = _mm_maddubs_epi16(a0, b0);
__m128i product1 = _mm_maddubs_epi16(a1, b1);
__m128i product2 = _mm_maddubs_epi16(a2, b2);
__m128i product3 = _mm_maddubs_epi16(a3, b3);
product0 = _mm_adds_epi16(product0, product1);
product0 = _mm_madd_epi16(product0, Ones128);
product2 = _mm_adds_epi16(product2, product3);
product2 = _mm_madd_epi16(product2, Ones128);
acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product2));
};
#endif
#if defined (USE_AVX512)
using vec_t = __m512i;
#define vec_setzero _mm512_setzero_si512
#define vec_set_32 _mm512_set1_epi32
auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32;
auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4;
auto& vec_hadd = m512_hadd;
using acc_vec_t = __m512i;
using bias_vec_t = __m128i;
using weight_vec_t = __m512i;
using in_vec_t = __m512i;
#define vec_zero _mm512_setzero_si512()
#define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2
#define vec_hadd Simd::m512_hadd
#define vec_haddx4 Simd::m512_haddx4
#elif defined (USE_AVX2)
using vec_t = __m256i;
#define vec_setzero _mm256_setzero_si256
#define vec_set_32 _mm256_set1_epi32
auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32;
auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4;
auto& vec_hadd = m256_hadd;
using acc_vec_t = __m256i;
using bias_vec_t = __m128i;
using weight_vec_t = __m256i;
using in_vec_t = __m256i;
#define vec_zero _mm256_setzero_si256()
#define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
#define vec_hadd Simd::m256_hadd
#define vec_haddx4 Simd::m256_haddx4
#elif defined (USE_SSSE3)
using vec_t = __m128i;
#define vec_setzero _mm_setzero_si128
#define vec_set_32 _mm_set1_epi32
auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32;
auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4;
auto& vec_hadd = m128_hadd;
using acc_vec_t = __m128i;
using bias_vec_t = __m128i;
using weight_vec_t = __m128i;
using in_vec_t = __m128i;
#define vec_zero _mm_setzero_si128()
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_hadd Simd::m128_hadd
#define vec_haddx4 Simd::m128_haddx4
#elif defined (USE_NEON)
using acc_vec_t = int32x4_t;
using bias_vec_t = int32x4_t;
using weight_vec_t = int8x8_t;
using in_vec_t = int8x8_t;
#define vec_zero {0}
#define vec_add_dpbusd_32x2 Simd::neon_m128_add_dpbusd_epi32x2
#define vec_hadd Simd::neon_m128_hadd
#define vec_haddx4 Simd::neon_m128_haddx4
#endif
#if defined (USE_SSSE3)
// Different layout: we always process 4 inputs at a time.
static_assert(InputDimensions % 4 == 0);
#if defined (USE_SSSE3) || defined (USE_NEON)
const in_vec_t* invec = reinterpret_cast<const in_vec_t*>(input);
const auto output = reinterpret_cast<OutputType*>(buffer);
const auto inputVector = reinterpret_cast<const vec_t*>(input);
static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
// OutputDimensions is either 1 or a multiple of SimdWidth
// because then it is also an input dimension.
if constexpr (OutputDimensions % OutputSimdWidth == 0)
// Perform accumulation to registers for each big block
for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock)
{
constexpr IndexType NumChunks = InputDimensions / 4;
acc_vec_t acc[NumOutputRegs] = { vec_zero };
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
vec_t* outptr = reinterpret_cast<vec_t*>(output);
std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
// Each big block has NumOutputRegs small blocks in each "row", one per register.
// We process two small blocks at a time to save on one addition without VNNI.
for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2)
{
const weight_vec_t* weightvec =
reinterpret_cast<const weight_vec_t*>(
weights
+ bigBlock * BigBlockSize
+ smallBlock * SmallBlockSize * NumOutputRegs);
for (int i = 0; i < (int)NumChunks - 3; i += 4)
const in_vec_t in0 = invec[smallBlock + 0];
const in_vec_t in1 = invec[smallBlock + 1];
for (IndexType k = 0; k < NumOutputRegs; ++k)
vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]);
}
// Horizontally add all accumulators.
if constexpr (NumOutputRegs % 4 == 0)
{
bias_vec_t* outputvec = reinterpret_cast<bias_vec_t*>(output);
const bias_vec_t* biasvec = reinterpret_cast<const bias_vec_t*>(biases);
for (IndexType k = 0; k < NumOutputRegs; k += 4)
{
const vec_t in0 = vec_set_32(input32[i + 0]);
const vec_t in1 = vec_set_32(input32[i + 1]);
const vec_t in2 = vec_set_32(input32[i + 2]);
const vec_t in3 = vec_set_32(input32[i + 3]);
const auto col0 = reinterpret_cast<const vec_t*>(&weights[(i + 0) * OutputDimensions * 4]);
const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
const auto col2 = reinterpret_cast<const vec_t*>(&weights[(i + 2) * OutputDimensions * 4]);
const auto col3 = reinterpret_cast<const vec_t*>(&weights[(i + 3) * OutputDimensions * 4]);
for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
const IndexType idx = (bigBlock * NumOutputRegs + k) / 4;
outputvec[idx] = vec_haddx4(acc[k+0], acc[k+1], acc[k+2], acc[k+3], biasvec[idx]);
}
}
else if constexpr (OutputDimensions == 1)
{
#if defined (USE_AVX512)
if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0)
}
else
{
for (IndexType k = 0; k < NumOutputRegs; ++k)
{
constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
const auto inputVector256 = reinterpret_cast<const __m256i*>(input);
__m256i sum0 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights[0]);
for (int j = 0; j < (int)NumChunks; ++j)
{
const __m256i in = inputVector256[j];
m256_add_dpbusd_epi32(sum0, in, row0[j]);
}
output[0] = m256_hadd(sum0, biases[0]);
}
else
#endif
{
#if defined (USE_AVX512)
constexpr IndexType NumChunks = PaddedInputDimensions / (SimdWidth * 2);
#else
constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
#endif
vec_t sum0 = vec_setzero();
const auto row0 = reinterpret_cast<const vec_t*>(&weights[0]);
for (int j = 0; j < (int)NumChunks; ++j)
{
const vec_t in = inputVector[j];
vec_add_dpbusd_32(sum0, in, row0[j]);
}
output[0] = vec_hadd(sum0, biases[0]);
const IndexType idx = (bigBlock * NumOutputRegs + k);
output[idx] = vec_hadd(acc[k], biases[idx]);
}
}
}
# undef vec_zero
# undef vec_add_dpbusd_32x2
# undef vec_hadd
# undef vec_haddx4
#else
// Use old implementation for the other architectures.
auto output = reinterpret_cast<OutputType*>(buffer);
#if defined(USE_SSE2)
// At least a multiple of 16, with SSE2.
static_assert(InputDimensions % SimdWidth == 0);
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const __m128i Zeros = _mm_setzero_si128();
const auto inputVector = reinterpret_cast<const __m128i*>(input);
#elif defined(USE_MMX)
static_assert(InputDimensions % SimdWidth == 0);
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const __m64 Zeros = _mm_setzero_si64();
const auto inputVector = reinterpret_cast<const __m64*>(input);
#elif defined(USE_NEON)
static_assert(InputDimensions % SimdWidth == 0);
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
#endif
for (IndexType i = 0; i < OutputDimensions; ++i) {
const IndexType offset = i * PaddedInputDimensions;
#if defined(USE_SSE2)
__m128i sumLo = _mm_cvtsi32_si128(biases[i]);
__m128i sumHi = Zeros;
const auto row = reinterpret_cast<const __m128i*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
__m128i row_j = _mm_load_si128(&row[j]);
__m128i input_j = _mm_load_si128(&inputVector[j]);
__m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8);
__m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8);
__m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros);
__m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros);
__m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo);
__m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi);
sumLo = _mm_add_epi32(sumLo, productLo);
sumHi = _mm_add_epi32(sumHi, productHi);
}
__m128i sum = _mm_add_epi32(sumLo, sumHi);
__m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(sum, sumHigh_64);
__m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(sum, sum_second_32);
output[i] = _mm_cvtsi128_si32(sum);
#elif defined(USE_MMX)
__m64 sumLo = _mm_cvtsi32_si64(biases[i]);
__m64 sumHi = Zeros;
const auto row = reinterpret_cast<const __m64*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
__m64 row_j = row[j];
__m64 input_j = inputVector[j];
__m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8);
__m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8);
__m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros);
__m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros);
__m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo);
__m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi);
sumLo = _mm_add_pi32(sumLo, productLo);
sumHi = _mm_add_pi32(sumHi, productHi);
}
__m64 sum = _mm_add_pi32(sumLo, sumHi);
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
output[i] = _mm_cvtsi64_si32(sum);
#elif defined(USE_NEON)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]);
product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]);
sum = vpadalq_s16(sum, product);
}
output[i] = sum[0] + sum[1] + sum[2] + sum[3];
#else
OutputType sum = biases[i];
for (IndexType j = 0; j < InputDimensions; ++j) {
sum += weights[offset + j] * input[j];
}
output[i] = sum;
#endif
}
#if defined(USE_MMX)
_mm_empty();
#endif
// Use old implementation for the other architectures.
affine_transform_non_ssse3<
InputDimensions,
PaddedInputDimensions,
OutputDimensions>(output, weights, biases, input);
#endif
@ -424,7 +364,171 @@ namespace Stockfish::Eval::NNUE::Layers {
using BiasType = OutputType;
using WeightType = std::int8_t;
PreviousLayer previousLayer;
alignas(CacheLineSize) BiasType biases[OutputDimensions];
alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
};
template <IndexType InDims, IndexType OutDims>
class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) < 2*64)>> {
public:
// Input/output type
using InputType = std::uint8_t;
using OutputType = std::int32_t;
// Number of input/output dimensions
static constexpr IndexType InputDimensions = InDims;
static constexpr IndexType OutputDimensions = OutDims;
static constexpr IndexType PaddedInputDimensions =
ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
static constexpr IndexType PaddedOutputDimensions =
ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
using OutputBuffer = OutputType[PaddedOutputDimensions];
static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen.");
#if defined (USE_SSSE3)
static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
static constexpr const IndexType InputSimdWidth = SimdWidth;
#endif
// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
std::uint32_t hashValue = 0xCC03DAE4u;
hashValue += OutputDimensions;
hashValue ^= prevHash >> 1;
hashValue ^= prevHash << 31;
return hashValue;
}
static IndexType get_weight_index_scrambled(IndexType i)
{
return
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
i / PaddedInputDimensions * 4 +
i % 4;
}
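// Illustrative example (hypothetical tiny sizes, not the real network):
// with PaddedInputDimensions = 8 and OutputDimensions = 2, the row-major
// weights [out0 in0..7][out1 in0..7] are scrambled into
//   [in0..3 of out0][in0..3 of out1][in4..7 of out0][in4..7 of out1],
// i.e. grouped by 4-byte input chunk first, so propagate() can broadcast one
// 32-bit input chunk and multiply it against the weights of all outputs
// stored at consecutive addresses.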
static IndexType get_weight_index(IndexType i)
{
#if defined (USE_SSSE3)
return get_weight_index_scrambled(i);
#else
return i;
#endif
}
// Read network parameters
bool read_parameters(std::istream& stream) {
for (IndexType i = 0; i < OutputDimensions; ++i)
biases[i] = read_little_endian<BiasType>(stream);
for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
return !stream.fail();
}
// Write network parameters
bool write_parameters(std::ostream& stream) const {
for (IndexType i = 0; i < OutputDimensions; ++i)
write_little_endian<BiasType>(stream, biases[i]);
for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
return !stream.fail();
}
// Forward propagation
const OutputType* propagate(
const InputType* input, OutputType* output) const {
#if defined (USE_AVX2)
using vec_t = __m256i;
#define vec_setzero _mm256_setzero_si256
#define vec_set_32 _mm256_set1_epi32
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
#define vec_add_dpbusd_32x4 Simd::m256_add_dpbusd_epi32x4
#define vec_hadd Simd::m256_hadd
#define vec_haddx4 Simd::m256_haddx4
#elif defined (USE_SSSE3)
using vec_t = __m128i;
#define vec_setzero _mm_setzero_si128
#define vec_set_32 _mm_set1_epi32
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_add_dpbusd_32x4 Simd::m128_add_dpbusd_epi32x4
#define vec_hadd Simd::m128_hadd
#define vec_haddx4 Simd::m128_haddx4
#endif
#if defined (USE_SSSE3)
const auto inputVector = reinterpret_cast<const vec_t*>(input);
static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
if constexpr (OutputDimensions % OutputSimdWidth == 0)
{
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 4;
constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
vec_t acc[NumRegs];
for (IndexType k = 0; k < NumRegs; ++k)
acc[k] = biasvec[k];
for (IndexType i = 0; i < NumChunks; i += 2)
{
const vec_t in0 = vec_set_32(input32[i + 0]);
const vec_t in1 = vec_set_32(input32[i + 1]);
const auto col0 = reinterpret_cast<const vec_t*>(&weights[(i + 0) * OutputDimensions * 4]);
const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
for (IndexType k = 0; k < NumRegs; ++k)
vec_add_dpbusd_32x2(acc[k], in0, col0[k], in1, col1[k]);
}
vec_t* outptr = reinterpret_cast<vec_t*>(output);
for (IndexType k = 0; k < NumRegs; ++k)
outptr[k] = acc[k];
}
else if constexpr (OutputDimensions == 1)
{
constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
vec_t sum0 = vec_setzero();
const auto row0 = reinterpret_cast<const vec_t*>(&weights[0]);
for (int j = 0; j < (int)NumChunks; ++j)
{
const vec_t in = inputVector[j];
vec_add_dpbusd_32(sum0, in, row0[j]);
}
output[0] = vec_hadd(sum0, biases[0]);
}
# undef vec_setzero
# undef vec_set_32
# undef vec_add_dpbusd_32
# undef vec_add_dpbusd_32x2
# undef vec_add_dpbusd_32x4
# undef vec_hadd
# undef vec_haddx4
#else
// Use old implementation for the other architectures.
affine_transform_non_ssse3<
InputDimensions,
PaddedInputDimensions,
OutputDimensions>(output, weights, biases, input);
#endif
return output;
}
private:
using BiasType = OutputType;
using WeightType = std::int8_t;
alignas(CacheLineSize) BiasType biases[OutputDimensions];
alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -26,50 +26,41 @@
namespace Stockfish::Eval::NNUE::Layers {
// Clipped ReLU
template <typename PreviousLayer>
template <IndexType InDims>
class ClippedReLU {
public:
// Input/output type
using InputType = typename PreviousLayer::OutputType;
using InputType = std::int32_t;
using OutputType = std::uint8_t;
static_assert(std::is_same<InputType, std::int32_t>::value, "");
// Number of input/output dimensions
static constexpr IndexType InputDimensions =
PreviousLayer::OutputDimensions;
static constexpr IndexType InputDimensions = InDims;
static constexpr IndexType OutputDimensions = InputDimensions;
static constexpr IndexType PaddedOutputDimensions =
ceil_to_multiple<IndexType>(OutputDimensions, 32);
// Size of forward propagation buffer used in this layer
static constexpr std::size_t SelfBufferSize =
ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
// Size of the forward propagation buffer used from the input layer to this layer
static constexpr std::size_t BufferSize =
PreviousLayer::BufferSize + SelfBufferSize;
using OutputBuffer = OutputType[PaddedOutputDimensions];
// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value() {
static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
std::uint32_t hashValue = 0x538D24C7u;
hashValue += PreviousLayer::get_hash_value();
hashValue += prevHash;
return hashValue;
}
// Read network parameters
bool read_parameters(std::istream& stream) {
return previousLayer.read_parameters(stream);
bool read_parameters(std::istream&) {
return true;
}
// Write network parameters
bool write_parameters(std::ostream& stream) const {
return previousLayer.write_parameters(stream);
bool write_parameters(std::ostream&) const {
return true;
}
// Forward propagation
const OutputType* propagate(
const TransformedFeatureType* transformedFeatures, char* buffer) const {
const auto input = previousLayer.propagate(
transformedFeatures, buffer + SelfBufferSize);
const auto output = reinterpret_cast<OutputType*>(buffer);
const InputType* input, OutputType* output) const {
#if defined(USE_AVX2)
if constexpr (InputDimensions % SimdWidth == 0) {
@ -179,11 +170,9 @@ namespace Stockfish::Eval::NNUE::Layers {
output[i] = static_cast<OutputType>(
std::max(0, std::min(127, input[i] >> WeightScaleBits)));
}
return output;
}
private:
PreviousLayer previousLayer;
};
} // namespace Stockfish::Eval::NNUE::Layers

View File

@ -1,73 +0,0 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// NNUE evaluation function layer InputSlice definition
#ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
#define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
#include "../nnue_common.h"
namespace Stockfish::Eval::NNUE::Layers {
// Input layer
template <IndexType OutDims, IndexType Offset = 0>
class InputSlice {
public:
// Need to maintain alignment
static_assert(Offset % MaxSimdWidth == 0, "");
// Output type
using OutputType = TransformedFeatureType;
// Output dimensionality
static constexpr IndexType OutputDimensions = OutDims;
// Size of forward propagation buffer used from the input layer to this layer
static constexpr std::size_t BufferSize = 0;
// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value() {
std::uint32_t hashValue = 0xEC42E90Du;
hashValue ^= OutputDimensions ^ (Offset << 10);
return hashValue;
}
// Read network parameters
bool read_parameters(std::istream& /*stream*/) {
return true;
}
// Write network parameters
bool write_parameters(std::ostream& /*stream*/) const {
return true;
}
// Forward propagation
const OutputType* propagate(
const TransformedFeatureType* transformedFeatures,
char* /*buffer*/) const {
return transformedFeatures + Offset;
}
private:
};
} // namespace Stockfish::Eval::NNUE::Layers
#endif // #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -21,39 +21,112 @@
#ifndef NNUE_ARCHITECTURE_H_INCLUDED
#define NNUE_ARCHITECTURE_H_INCLUDED
#include <memory>
#include "nnue_common.h"
#include "features/half_ka_v2.h"
#include "features/half_ka_v2_hm.h"
#include "layers/input_slice.h"
#include "layers/affine_transform.h"
#include "layers/clipped_relu.h"
#include "../misc.h"
namespace Stockfish::Eval::NNUE {
// Input features used in evaluation function
using FeatureSet = Features::HalfKAv2;
// Input features used in evaluation function
using FeatureSet = Features::HalfKAv2_hm;
// Number of input feature dimensions after conversion
constexpr IndexType TransformedFeatureDimensions = 512;
constexpr IndexType PSQTBuckets = 8;
constexpr IndexType LayerStacks = 8;
// Number of input feature dimensions after conversion
constexpr IndexType TransformedFeatureDimensions = 1024;
constexpr IndexType PSQTBuckets = 8;
constexpr IndexType LayerStacks = 8;
namespace Layers {
struct Network
{
static constexpr int FC_0_OUTPUTS = 15;
static constexpr int FC_1_OUTPUTS = 32;
// Define network structure
using InputLayer = InputSlice<TransformedFeatureDimensions * 2>;
using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 16>>;
using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
using OutputLayer = AffineTransform<HiddenLayer2, 1>;
Layers::AffineTransform<TransformedFeatureDimensions, FC_0_OUTPUTS + 1> fc_0;
Layers::ClippedReLU<FC_0_OUTPUTS + 1> ac_0;
Layers::AffineTransform<FC_0_OUTPUTS, FC_1_OUTPUTS> fc_1;
Layers::ClippedReLU<FC_1_OUTPUTS> ac_1;
Layers::AffineTransform<FC_1_OUTPUTS, 1> fc_2;
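// The transformed features feed fc_0, which produces FC_0_OUTPUTS + 1 = 16
// values: the first FC_0_OUTPUTS go through ac_0 -> fc_1 -> ac_1 -> fc_2,
// while the extra value bypasses the hidden layers and is added to the final
// sum in propagate() below (a small skip connection).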
} // namespace Layers
// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value() {
// input slice hash
std::uint32_t hashValue = 0xEC42E90Du;
hashValue ^= TransformedFeatureDimensions * 2;
using Network = Layers::OutputLayer;
hashValue = decltype(fc_0)::get_hash_value(hashValue);
hashValue = decltype(ac_0)::get_hash_value(hashValue);
hashValue = decltype(fc_1)::get_hash_value(hashValue);
hashValue = decltype(ac_1)::get_hash_value(hashValue);
hashValue = decltype(fc_2)::get_hash_value(hashValue);
static_assert(TransformedFeatureDimensions % MaxSimdWidth == 0, "");
static_assert(Network::OutputDimensions == 1, "");
static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
return hashValue;
}
// Read network parameters
bool read_parameters(std::istream& stream) {
if (!fc_0.read_parameters(stream)) return false;
if (!ac_0.read_parameters(stream)) return false;
if (!fc_1.read_parameters(stream)) return false;
if (!ac_1.read_parameters(stream)) return false;
if (!fc_2.read_parameters(stream)) return false;
return true;
}
// Write network parameters
bool write_parameters(std::ostream& stream) const {
if (!fc_0.write_parameters(stream)) return false;
if (!ac_0.write_parameters(stream)) return false;
if (!fc_1.write_parameters(stream)) return false;
if (!ac_1.write_parameters(stream)) return false;
if (!fc_2.write_parameters(stream)) return false;
return true;
}
std::int32_t propagate(const TransformedFeatureType* transformedFeatures)
{
struct alignas(CacheLineSize) Buffer
{
alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out;
alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out;
alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out;
alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out;
alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out;
Buffer()
{
std::memset(this, 0, sizeof(*this));
}
};
#if defined(__clang__) && (__APPLE__)
// workaround for a bug reported with xcode 12
static thread_local auto tlsBuffer = std::make_unique<Buffer>();
// Access TLS only once, cache result.
Buffer& buffer = *tlsBuffer;
#else
alignas(CacheLineSize) static thread_local Buffer buffer;
#endif
fc_0.propagate(transformedFeatures, buffer.fc_0_out);
ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out);
fc_1.propagate(buffer.ac_0_out, buffer.fc_1_out);
ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out);
fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out);
// buffer.fc_0_out[FC_0_OUTPUTS] is such that 1.0 is equal to 127*(1<<WeightScaleBits) in quantized form
// but we want 1.0 to be equal to 600*OutputScale
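// For example, assuming the usual constants OutputScale = 16 and
// WeightScaleBits = 6 from nnue_common.h, the factor below is
// 600*16 / (127*64) = 9600 / 8128, i.e. the forwarded value is rescaled by
// roughly 1.18 to move it into the output domain.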
std::int32_t fwdOut = int(buffer.fc_0_out[FC_0_OUTPUTS]) * (600*OutputScale) / (127*(1<<WeightScaleBits));
std::int32_t outputValue = buffer.fc_2_out[0] + fwdOut;
return outputValue;
}
};
} // namespace Stockfish::Eval::NNUE

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -109,7 +109,7 @@ namespace Stockfish::Eval::NNUE {
// write_little_endian() is our utility to write an integer (signed or unsigned, any size)
// to a stream in little-endian order. We swap the byte order before the write if
// necessary to always write in little endian order, independantly of the byte
// necessary to always write in little endian order, independently of the byte
// ordering of the compiling machine.
template <typename IntType>
inline void write_little_endian(std::ostream& stream, IntType value) {
@ -127,11 +127,11 @@ namespace Stockfish::Eval::NNUE {
{
for (; i + 1 < sizeof(IntType); ++i)
{
u[i] = v;
u[i] = (std::uint8_t)v;
v >>= 8;
}
}
u[i] = v;
u[i] = (std::uint8_t)v;
stream.write(reinterpret_cast<char*>(u), sizeof(IntType));
}
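// Standalone sketch (illustrative only, not part of this header): the same
// little-endian convention demonstrated with plain standard C++. The value
// 0x0201 is always written as the byte 0x01 followed by 0x02, independently
// of the host byte order.
#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>

int main() {
    std::uint16_t v = 0x0201;
    std::uint8_t u[2] = { std::uint8_t(v & 0xFF), std::uint8_t(v >> 8) }; // LSB first
    std::stringstream ss;
    ss.write(reinterpret_cast<char*>(u), 2);
    std::string s = ss.str();
    assert(std::uint8_t(s[0]) == 0x01 && std::uint8_t(s[1]) == 0x02);
    return 0;
}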

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -47,12 +47,22 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
#define vec_zero() _mm512_setzero_epi32()
#define vec_set_16(a) _mm512_set1_epi16(a)
#define vec_max_16(a,b) _mm512_max_epi16(a,b)
#define vec_min_16(a,b) _mm512_min_epi16(a,b)
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a,7),_mm512_srli_epi16(b,7));
return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
}
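// vec_msb_pack_16(a, b) shifts each 16-bit lane right by 7 and saturate-packs
// the two inputs into one vector of 8-bit lanes; the permute restores the
// natural lane order that _mm512_packs_epi16 interleaves per 128-bit block.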
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
#define NumRegistersSIMD 32
#define MaxChunkSize 64
#elif USE_AVX2
typedef __m256i vec_t;
@ -61,12 +71,22 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
#define vec_zero() _mm256_setzero_si256()
#define vec_set_16(a) _mm256_set1_epi16(a)
#define vec_max_16(a,b) _mm256_max_epi16(a,b)
#define vec_min_16(a,b) _mm256_min_epi16(a,b)
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a,7), _mm256_srli_epi16(b,7));
return _mm256_permute4x64_epi64(compacted, 0b11011000);
}
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
#define NumRegistersSIMD 16
#define MaxChunkSize 32
#elif USE_SSE2
typedef __m128i vec_t;
@ -75,12 +95,19 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
#define vec_zero() _mm_setzero_si128()
#define vec_set_16(a) _mm_set1_epi16(a)
#define vec_max_16(a,b) _mm_max_epi16(a,b)
#define vec_min_16(a,b) _mm_min_epi16(a,b)
#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
#define vec_zero_psqt() _mm_setzero_si128()
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
#define MaxChunkSize 16
#elif USE_MMX
typedef __m64 vec_t;
@ -89,12 +116,26 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
#define vec_zero() _mm_setzero_si64()
#define vec_set_16(a) _mm_set1_pi16(a)
inline vec_t vec_max_16(vec_t a,vec_t b){
vec_t comparison = _mm_cmpgt_pi16(a,b);
return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
}
inline vec_t vec_min_16(vec_t a,vec_t b){
vec_t comparison = _mm_cmpgt_pi16(a,b);
return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
}
#define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
#define vec_zero_psqt() _mm_setzero_si64()
#define vec_cleanup() _mm_empty()
#define NumRegistersSIMD 8
#define MaxChunkSize 8
#elif USE_NEON
typedef int16x8_t vec_t;
@ -103,12 +144,24 @@ namespace Stockfish::Eval::NNUE {
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
#define vec_mul_16(a,b) vmulq_s16(a,b)
#define vec_zero() vec_t{0}
#define vec_set_16(a) vdupq_n_s16(a)
#define vec_max_16(a,b) vmaxq_s16(a,b)
#define vec_min_16(a,b) vminq_s16(a,b)
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
const int8x8_t shifta = vshrn_n_s16(a, 7);
const int8x8_t shiftb = vshrn_n_s16(b, 7);
const int8x16_t compacted = vcombine_s8(shifta,shiftb);
return *reinterpret_cast<const vec_t*> (&compacted);
}
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
#define vec_zero_psqt() psqt_vec_t{0}
#define NumRegistersSIMD 16
#define MaxChunkSize 16
#else
#undef VECTOR
@ -123,8 +176,10 @@ namespace Stockfish::Eval::NNUE {
// We use __m* types as template arguments, which causes GCC to emit warnings
// about losing some attribute information. This is irrelevant to us as we
// only take their size, so the following pragmas are harmless.
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
template <typename SIMDRegisterType,
typename LaneType,
@ -156,9 +211,9 @@ namespace Stockfish::Eval::NNUE {
static constexpr int NumRegs = BestRegisterCount<vec_t, WeightType, TransformedFeatureDimensions, NumRegistersSIMD>();
static constexpr int NumPsqtRegs = BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
#endif
@ -183,7 +238,7 @@ namespace Stockfish::Eval::NNUE {
// Number of input/output dimensions
static constexpr IndexType InputDimensions = FeatureSet::Dimensions;
static constexpr IndexType OutputDimensions = HalfDimensions * 2;
static constexpr IndexType OutputDimensions = HalfDimensions;
// Size of forward propagation buffer
static constexpr std::size_t BufferSize =
@ -191,7 +246,7 @@ namespace Stockfish::Eval::NNUE {
// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value() {
return FeatureSet::HashValue ^ OutputDimensions;
return FeatureSet::HashValue ^ (OutputDimensions * 2);
}
// Read network parameters
@ -229,136 +284,55 @@ namespace Stockfish::Eval::NNUE {
) / 2;
#if defined(USE_AVX512)
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
static_assert(HalfDimensions % (SimdWidth * 2) == 0);
const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
const __m512i Zero = _mm512_setzero_si512();
for (IndexType p = 0; p < 2; ++p)
{
const IndexType offset = HalfDimensions * p;
auto out = reinterpret_cast<__m512i*>(&output[offset]);
for (IndexType j = 0; j < NumChunks; ++j)
const IndexType offset = (HalfDimensions / 2) * p;
#if defined(VECTOR)
constexpr IndexType OutputChunkSize = MaxChunkSize;
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
vec_t Zero = vec_zero();
vec_t One = vec_set_16(127);
const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
const vec_t* in1 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
vec_t* out = reinterpret_cast< vec_t*>(output + offset);
for (IndexType j = 0; j < NumOutputChunks; j += 1)
{
__m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
(accumulation[perspectives[p]])[j * 2 + 0]);
__m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
(accumulation[perspectives[p]])[j * 2 + 1]);
const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero);
const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero);
const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero);
const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero);
_mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
_mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
const vec_t pa = vec_mul_16(sum0a, sum1a);
const vec_t pb = vec_mul_16(sum0b, sum1b);
out[j] = vec_msb_pack_16(pa, pb);
}
}
return psqt;
#elif defined(USE_AVX2)
#else
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
constexpr int Control = 0b11011000;
const __m256i Zero = _mm256_setzero_si256();
for (IndexType p = 0; p < 2; ++p)
{
const IndexType offset = HalfDimensions * p;
auto out = reinterpret_cast<__m256i*>(&output[offset]);
for (IndexType j = 0; j < NumChunks; ++j)
{
__m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
(accumulation[perspectives[p]])[j * 2 + 0]);
__m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
(accumulation[perspectives[p]])[j * 2 + 1]);
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
_mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
for (IndexType j = 0; j < HalfDimensions / 2; ++j) {
BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
BiasType sum1 = accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
sum0 = std::max<int>(0, std::min<int>(127, sum0));
sum1 = std::max<int>(0, std::min<int>(127, sum1));
output[offset + j] = static_cast<OutputType>(sum0 * sum1 / 128);
}
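// Note: this scalar fallback matches the vectorized path above: each output is
// the product of two clipped halves of the accumulator, scaled down by 128
// (the vector code performs the /128 via the shift-by-7 inside vec_msb_pack_16).
// With both factors clipped to [0, 127] the product is at most 16129, so the
// scaled result fits comfortably in the uint8_t output.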
#endif
}
#if defined(vec_cleanup)
vec_cleanup();
#endif
return psqt;
#elif defined(USE_SSE2)
#ifdef USE_SSE41
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
const __m128i Zero = _mm_setzero_si128();
#else
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
for (IndexType p = 0; p < 2; ++p)
{
const IndexType offset = HalfDimensions * p;
auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < NumChunks; ++j)
{
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
(accumulation[perspectives[p]])[j * 2 + 0]);
__m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
(accumulation[perspectives[p]])[j * 2 + 1]);
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
#ifdef USE_SSE41
_mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero));
#else
_mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
#endif
}
}
return psqt;
#elif defined(USE_MMX)
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
const __m64 k0x80s = _mm_set1_pi8(-128);
for (IndexType p = 0; p < 2; ++p)
{
const IndexType offset = HalfDimensions * p;
auto out = reinterpret_cast<__m64*>(&output[offset]);
for (IndexType j = 0; j < NumChunks; ++j)
{
__m64 sum0 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 0]);
__m64 sum1 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 1]);
const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
}
}
_mm_empty();
return psqt;
#elif defined(USE_NEON)
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
const int8x8_t Zero = {0};
for (IndexType p = 0; p < 2; ++p)
{
const IndexType offset = HalfDimensions * p;
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < NumChunks; ++j)
{
int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
out[j] = vmax_s8(vqmovn_s16(sum), Zero);
}
}
return psqt;
#else
for (IndexType p = 0; p < 2; ++p)
{
const IndexType offset = HalfDimensions * p;
for (IndexType j = 0; j < HalfDimensions; ++j)
{
BiasType sum = accumulation[perspectives[p]][j];
output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
}
}
return psqt;
#endif
} // end of function transform()
@ -370,7 +344,6 @@ namespace Stockfish::Eval::NNUE {
// That might depend on the feature set and generally relies on the
// feature set's update cost calculation to be correct and never
// allow updates with more added/removed features than MaxActiveDimensions.
using IndexList = ValueList<IndexType, FeatureSet::MaxActiveDimensions>;
#ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
@ -404,12 +377,12 @@ namespace Stockfish::Eval::NNUE {
// Gather all features to be updated.
const Square ksq = pos.square<KING>(perspective);
IndexList removed[2], added[2];
FeatureSet::IndexList removed[2], added[2];
FeatureSet::append_changed_indices(
ksq, next, perspective, removed[0], added[0]);
ksq, next->dirtyPiece, perspective, removed[0], added[0]);
for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous)
FeatureSet::append_changed_indices(
ksq, st2, perspective, removed[1], added[1]);
ksq, st2->dirtyPiece, perspective, removed[1], added[1]);
// Mark the accumulators as computed.
next->accumulator.computed[perspective] = true;
@ -534,7 +507,7 @@ namespace Stockfish::Eval::NNUE {
// Refresh the accumulator
auto& accumulator = pos.state()->accumulator;
accumulator.computed[perspective] = true;
IndexList active;
FeatureSet::IndexList active;
FeatureSet::append_active_indices(pos, perspective, active);
#ifdef VECTOR

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -32,30 +32,30 @@ namespace {
#define S(mg, eg) make_score(mg, eg)
// Pawn penalties
constexpr Score Backward = S( 9, 22);
constexpr Score Doubled = S(13, 51);
constexpr Score DoubledEarly = S(20, 7);
constexpr Score Isolated = S( 3, 15);
constexpr Score WeakLever = S( 4, 58);
constexpr Score WeakUnopposed = S(13, 24);
constexpr Score Backward = S( 6, 19);
constexpr Score Doubled = S(11, 51);
constexpr Score DoubledEarly = S(17, 7);
constexpr Score Isolated = S( 1, 20);
constexpr Score WeakLever = S( 2, 57);
constexpr Score WeakUnopposed = S(15, 18);
// Bonus for blocked pawns at 5th or 6th rank
constexpr Score BlockedPawn[2] = { S(-17, -6), S(-9, 2) };
constexpr Score BlockedPawn[2] = { S(-19, -8), S(-7, 3) };
constexpr Score BlockedStorm[RANK_NB] = {
S(0, 0), S(0, 0), S(75, 78), S(-8, 16), S(-6, 10), S(-6, 6), S(0, 2)
S(0, 0), S(0, 0), S(64, 75), S(-3, 14), S(-12, 19), S(-7, 4), S(-10, 5)
};
// Connected pawn bonus
constexpr int Connected[RANK_NB] = { 0, 5, 7, 11, 23, 48, 87 };
constexpr int Connected[RANK_NB] = { 0, 3, 7, 7, 15, 54, 86 };
// Strength of pawn shelter for our king by [distance from edge][rank].
// RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king.
constexpr Value ShelterStrength[int(FILE_NB) / 2][RANK_NB] = {
{ V( -5), V( 82), V( 92), V( 54), V( 36), V( 22), V( 28) },
{ V(-44), V( 63), V( 33), V(-50), V(-30), V(-12), V( -62) },
{ V(-11), V( 77), V( 22), V( -6), V( 31), V( 8), V( -45) },
{ V(-39), V(-12), V(-29), V(-50), V(-43), V(-68), V(-164) }
{ V(-2), V(85), V(95), V(53), V(39), V(23), V(25) },
{ V(-55), V(64), V(32), V(-55), V(-30), V(-11), V(-61) },
{ V(-11), V(75), V(19), V(-6), V(26), V(9), V(-47) },
{ V(-41), V(-11), V(-27), V(-58), V(-42), V(-66), V(-163) }
};
// Danger of enemy pawns moving toward our king by [distance from edge][rank].
@ -63,17 +63,17 @@ namespace {
// is behind our king. Note that UnblockedStorm[0][1-2] accommodate opponent pawn
// on edge, likely blocked by our king.
constexpr Value UnblockedStorm[int(FILE_NB) / 2][RANK_NB] = {
{ V( 87), V(-288), V(-168), V( 96), V( 47), V( 44), V( 46) },
{ V( 42), V( -25), V( 120), V( 45), V( 34), V( -9), V( 24) },
{ V( -8), V( 51), V( 167), V( 35), V( -4), V(-16), V(-12) },
{ V(-17), V( -13), V( 100), V( 4), V( 9), V(-16), V(-31) }
{ V(94), V(-280), V(-170), V(90), V(59), V(47), V(53) },
{ V(43), V(-17), V(128), V(39), V(26), V(-17), V(15) },
{ V(-9), V(62), V(170), V(34), V(-5), V(-20), V(-11) },
{ V(-27), V(-19), V(106), V(10), V(2), V(-13), V(-24) }
};
// KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties
// for king when the king is on a semi-open or open file.
constexpr Score KingOnFile[2][2] = {{ S(-21,10), S(-7, 1) },
{ S( 0,-3), S( 9,-4) }};
constexpr Score KingOnFile[2][2] = {{ S(-18,11), S(-6,-3) },
{ S( 0, 0), S( 5,-4) }};
#undef S
#undef V

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -1013,9 +1013,9 @@ void Position::do_null_move(StateInfo& newSt) {
}
st->key ^= Zobrist::side;
++st->rule50;
prefetch(TT.first_entry(key()));
++st->rule50;
st->pliesFromNull = 0;
sideToMove = ~sideToMove;
@ -1080,8 +1080,9 @@ bool Position::see_ge(Move m, Value threshold) const {
if (swap <= 0)
return true;
assert(color_of(piece_on(from)) == sideToMove);
Bitboard occupied = pieces() ^ from ^ to;
Color stm = color_of(piece_on(from));
Color stm = sideToMove;
Bitboard attackers = attackers_to(to, occupied);
Bitboard stmAttackers, bb;
int res = 1;

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -120,12 +120,12 @@ public:
Bitboard attackers_to(Square s) const;
Bitboard attackers_to(Square s, Bitboard occupied) const;
Bitboard slider_blockers(Bitboard sliders, Square s, Bitboard& pinners) const;
template<PieceType Pt> Bitboard attacks_by(Color c) const;
// Properties of moves
bool legal(Move m) const;
bool pseudo_legal(const Move m) const;
bool capture(Move m) const;
bool capture_or_promotion(Move m) const;
bool gives_check(Move m) const;
Piece moved_piece(Move m) const;
Piece captured_piece() const;
@ -285,6 +285,22 @@ inline Bitboard Position::attackers_to(Square s) const {
return attackers_to(s, pieces());
}
template<PieceType Pt>
inline Bitboard Position::attacks_by(Color c) const {
if constexpr (Pt == PAWN)
return c == WHITE ? pawn_attacks_bb<WHITE>(pieces(WHITE, PAWN))
: pawn_attacks_bb<BLACK>(pieces(BLACK, PAWN));
else
{
Bitboard threats = 0;
Bitboard attackers = pieces(c, Pt);
while (attackers)
threats |= attacks_bb<Pt>(pop_lsb(attackers), pieces());
return threats;
}
}
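// Usage sketch (illustrative): pos.attacks_by<KNIGHT>(WHITE) yields a bitboard
// of all squares attacked by white knights in the current position, while
// pos.attacks_by<PAWN>(BLACK) takes the dedicated pawn branch above.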
inline Bitboard Position::checkers() const {
return st->checkersBB;
}
@ -352,11 +368,6 @@ inline bool Position::is_chess960() const {
return chess960;
}
inline bool Position::capture_or_promotion(Move m) const {
assert(is_ok(m));
return type_of(m) != NORMAL ? type_of(m) != CASTLING : !empty(to_sq(m));
}
inline bool Position::capture(Move m) const {
assert(is_ok(m));
// Castling is encoded as "king captures rook"

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

File diff suppressed because it is too large

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -47,6 +47,7 @@ struct Stack {
Move excludedMove;
Move killers[2];
Value staticEval;
Depth depth;
int statScore;
int moveCount;
bool inCheck;
@ -72,6 +73,7 @@ struct RootMove {
Value score = -VALUE_INFINITE;
Value previousScore = -VALUE_INFINITE;
Value averageScore = -VALUE_INFINITE;
int selDepth = 0;
int tbRank = 0;
Value tbScore;

View File

@ -0,0 +1,387 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef STOCKFISH_SIMD_H_INCLUDED
#define STOCKFISH_SIMD_H_INCLUDED
#if defined(USE_AVX2)
# include <immintrin.h>
#elif defined(USE_SSE41)
# include <smmintrin.h>
#elif defined(USE_SSSE3)
# include <tmmintrin.h>
#elif defined(USE_SSE2)
# include <emmintrin.h>
#elif defined(USE_MMX)
# include <mmintrin.h>
#elif defined(USE_NEON)
# include <arm_neon.h>
#endif
// The inline asm is only safe for GCC, where it is necessary to get good codegen.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101693
// Clang does fine without it.
// Play around here: https://godbolt.org/z/7EWqrYq51
#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER))
#define USE_INLINE_ASM
#endif
// Use either the AVX512 or AVX-VNNI version of the VNNI instructions.
#if defined(USE_AVXVNNI)
#define VNNI_PREFIX "%{vex%} "
#else
#define VNNI_PREFIX ""
#endif
namespace Stockfish::Simd {
#if defined (USE_AVX512)
[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) {
return _mm512_reduce_add_epi32(sum) + bias;
}
/*
Parameters:
sum0 = [zmm0.i128[0], zmm0.i128[1], zmm0.i128[2], zmm0.i128[3]]
sum1 = [zmm1.i128[0], zmm1.i128[1], zmm1.i128[2], zmm1.i128[3]]
sum2 = [zmm2.i128[0], zmm2.i128[1], zmm2.i128[2], zmm2.i128[3]]
sum3 = [zmm3.i128[0], zmm3.i128[1], zmm3.i128[2], zmm3.i128[3]]
Returns:
ret = [
reduce_add_epi32(zmm0.i128[0]), reduce_add_epi32(zmm1.i128[0]), reduce_add_epi32(zmm2.i128[0]), reduce_add_epi32(zmm3.i128[0]),
reduce_add_epi32(zmm0.i128[1]), reduce_add_epi32(zmm1.i128[1]), reduce_add_epi32(zmm2.i128[1]), reduce_add_epi32(zmm3.i128[1]),
reduce_add_epi32(zmm0.i128[2]), reduce_add_epi32(zmm1.i128[2]), reduce_add_epi32(zmm2.i128[2]), reduce_add_epi32(zmm3.i128[2]),
reduce_add_epi32(zmm0.i128[3]), reduce_add_epi32(zmm1.i128[3]), reduce_add_epi32(zmm2.i128[3]), reduce_add_epi32(zmm3.i128[3])
]
*/
[[maybe_unused]] static __m512i m512_hadd128x16_interleave(
__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) {
__m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
__m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
__m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
__m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
__m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
__m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
__m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
__m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
return _mm512_add_epi32(sum0123a, sum0123b);
}
[[maybe_unused]] static __m128i m512_haddx4(
__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
__m128i bias) {
__m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
__m256i sum256lo = _mm512_castsi512_si256(sum);
__m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
__m128i sum128lo = _mm256_castsi256_si128(sum256lo);
__m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
}
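// m512_add_dpbusd_epi32(acc, a, b) adds to each 32-bit lane of acc the dot
// product of four unsigned bytes of a with the corresponding signed bytes of b.
// Without VNNI it is emulated with maddubs (u8*s8 pairs added with 16-bit
// saturation) followed by madd against a vector of ones (16-bit pairs summed
// into 32 bits); the quantization scheme keeps weights and activations small
// enough that the saturating intermediate does not overflow in practice.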
[[maybe_unused]] static void m512_add_dpbusd_epi32(
__m512i& acc,
__m512i a,
__m512i b) {
# if defined (USE_VNNI)
# if defined (USE_INLINE_ASM)
asm(
"vpdpbusd %[b], %[a], %[acc]\n\t"
: [acc]"+v"(acc)
: [a]"v"(a), [b]"vm"(b)
);
# else
acc = _mm512_dpbusd_epi32(acc, a, b);
# endif
# else
# if defined (USE_INLINE_ASM)
__m512i tmp = _mm512_maddubs_epi16(a, b);
asm(
"vpmaddwd %[tmp], %[ones], %[tmp]\n\t"
"vpaddd %[acc], %[tmp], %[acc]\n\t"
: [acc]"+v"(acc), [tmp]"+&v"(tmp)
: [ones]"v"(_mm512_set1_epi16(1))
);
# else
__m512i product0 = _mm512_maddubs_epi16(a, b);
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
acc = _mm512_add_epi32(acc, product0);
# endif
# endif
}
[[maybe_unused]] static void m512_add_dpbusd_epi32x2(
__m512i& acc,
__m512i a0, __m512i b0,
__m512i a1, __m512i b1) {
# if defined (USE_VNNI)
# if defined (USE_INLINE_ASM)
asm(
"vpdpbusd %[b0], %[a0], %[acc]\n\t"
"vpdpbusd %[b1], %[a1], %[acc]\n\t"
: [acc]"+v"(acc)
: [a0]"v"(a0), [b0]"vm"(b0), [a1]"v"(a1), [b1]"vm"(b1)
);
# else
acc = _mm512_dpbusd_epi32(acc, a0, b0);
acc = _mm512_dpbusd_epi32(acc, a1, b1);
# endif
# else
# if defined (USE_INLINE_ASM)
__m512i tmp0 = _mm512_maddubs_epi16(a0, b0);
__m512i tmp1 = _mm512_maddubs_epi16(a1, b1);
asm(
"vpaddsw %[tmp0], %[tmp1], %[tmp0]\n\t"
"vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t"
"vpaddd %[acc], %[tmp0], %[acc]\n\t"
: [acc]"+v"(acc), [tmp0]"+&v"(tmp0)
: [tmp1]"v"(tmp1), [ones]"v"(_mm512_set1_epi16(1))
);
# else
__m512i product0 = _mm512_maddubs_epi16(a0, b0);
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
product0 = _mm512_adds_epi16(product0, product1);
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
acc = _mm512_add_epi32(acc, product0);
# endif
# endif
}
#endif
#if defined (USE_AVX2)
[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) {
__m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
return _mm_cvtsi128_si32(sum128) + bias;
}
[[maybe_unused]] static __m128i m256_haddx4(
__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3,
__m128i bias) {
sum0 = _mm256_hadd_epi32(sum0, sum1);
sum2 = _mm256_hadd_epi32(sum2, sum3);
sum0 = _mm256_hadd_epi32(sum0, sum2);
__m128i sum128lo = _mm256_castsi256_si128(sum0);
__m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
}
[[maybe_unused]] static void m256_add_dpbusd_epi32(
__m256i& acc,
__m256i a,
__m256i b) {
# if defined (USE_VNNI)
# if defined (USE_INLINE_ASM)
asm(
VNNI_PREFIX "vpdpbusd %[b], %[a], %[acc]\n\t"
: [acc]"+v"(acc)
: [a]"v"(a), [b]"vm"(b)
);
# else
acc = _mm256_dpbusd_epi32(acc, a, b);
# endif
# else
# if defined (USE_INLINE_ASM)
__m256i tmp = _mm256_maddubs_epi16(a, b);
asm(
"vpmaddwd %[tmp], %[ones], %[tmp]\n\t"
"vpaddd %[acc], %[tmp], %[acc]\n\t"
: [acc]"+v"(acc), [tmp]"+&v"(tmp)
: [ones]"v"(_mm256_set1_epi16(1))
);
# else
__m256i product0 = _mm256_maddubs_epi16(a, b);
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
acc = _mm256_add_epi32(acc, product0);
# endif
# endif
}
[[maybe_unused]] static void m256_add_dpbusd_epi32x2(
__m256i& acc,
__m256i a0, __m256i b0,
__m256i a1, __m256i b1) {
# if defined (USE_VNNI)
# if defined (USE_INLINE_ASM)
asm(
VNNI_PREFIX "vpdpbusd %[b0], %[a0], %[acc]\n\t"
VNNI_PREFIX "vpdpbusd %[b1], %[a1], %[acc]\n\t"
: [acc]"+v"(acc)
: [a0]"v"(a0), [b0]"vm"(b0), [a1]"v"(a1), [b1]"vm"(b1)
);
# else
acc = _mm256_dpbusd_epi32(acc, a0, b0);
acc = _mm256_dpbusd_epi32(acc, a1, b1);
# endif
# else
# if defined (USE_INLINE_ASM)
__m256i tmp0 = _mm256_maddubs_epi16(a0, b0);
__m256i tmp1 = _mm256_maddubs_epi16(a1, b1);
asm(
"vpaddsw %[tmp0], %[tmp1], %[tmp0]\n\t"
"vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t"
"vpaddd %[acc], %[tmp0], %[acc]\n\t"
: [acc]"+v"(acc), [tmp0]"+&v"(tmp0)
: [tmp1]"v"(tmp1), [ones]"v"(_mm256_set1_epi16(1))
);
# else
__m256i product0 = _mm256_maddubs_epi16(a0, b0);
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
product0 = _mm256_adds_epi16(product0, product1);
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
acc = _mm256_add_epi32(acc, product0);
# endif
# endif
}
#endif
#if defined (USE_SSSE3)
[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) {
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
return _mm_cvtsi128_si32(sum) + bias;
}
[[maybe_unused]] static __m128i m128_haddx4(
__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3,
__m128i bias) {
sum0 = _mm_hadd_epi32(sum0, sum1);
sum2 = _mm_hadd_epi32(sum2, sum3);
sum0 = _mm_hadd_epi32(sum0, sum2);
return _mm_add_epi32(sum0, bias);
}
[[maybe_unused]] static void m128_add_dpbusd_epi32(
__m128i& acc,
__m128i a,
__m128i b) {
# if defined (USE_INLINE_ASM)
__m128i tmp = _mm_maddubs_epi16(a, b);
asm(
"pmaddwd %[ones], %[tmp]\n\t"
"paddd %[tmp], %[acc]\n\t"
: [acc]"+v"(acc), [tmp]"+&v"(tmp)
: [ones]"v"(_mm_set1_epi16(1))
);
# else
__m128i product0 = _mm_maddubs_epi16(a, b);
product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
acc = _mm_add_epi32(acc, product0);
# endif
}
[[maybe_unused]] static void m128_add_dpbusd_epi32x2(
__m128i& acc,
__m128i a0, __m128i b0,
__m128i a1, __m128i b1) {
# if defined (USE_INLINE_ASM)
__m128i tmp0 = _mm_maddubs_epi16(a0, b0);
__m128i tmp1 = _mm_maddubs_epi16(a1, b1);
asm(
"paddsw %[tmp1], %[tmp0]\n\t"
"pmaddwd %[ones], %[tmp0]\n\t"
"paddd %[tmp0], %[acc]\n\t"
: [acc]"+v"(acc), [tmp0]"+&v"(tmp0)
: [tmp1]"v"(tmp1), [ones]"v"(_mm_set1_epi16(1))
);
# else
__m128i product0 = _mm_maddubs_epi16(a0, b0);
__m128i product1 = _mm_maddubs_epi16(a1, b1);
product0 = _mm_adds_epi16(product0, product1);
product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
acc = _mm_add_epi32(acc, product0);
# endif
}
#endif
#if defined (USE_NEON)
[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
# if USE_NEON >= 8
return vaddvq_s32(s);
# else
return s[0] + s[1] + s[2] + s[3];
# endif
}
[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
return neon_m128_reduce_add_epi32(sum) + bias;
}
[[maybe_unused]] static int32x4_t neon_m128_haddx4(
int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3,
int32x4_t bias) {
int32x4_t hsums {
neon_m128_reduce_add_epi32(sum0),
neon_m128_reduce_add_epi32(sum1),
neon_m128_reduce_add_epi32(sum2),
neon_m128_reduce_add_epi32(sum3)
};
return vaddq_s32(hsums, bias);
}
[[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2(
int32x4_t& acc,
int8x8_t a0, int8x8_t b0,
int8x8_t a1, int8x8_t b1) {
int16x8_t product = vmull_s8(a0, b0);
product = vmlal_s8(product, a1, b1);
acc = vpadalq_s16(acc, product);
}
#endif
}
#endif // STOCKFISH_SIMD_H_INCLUDED
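All the x86 *_add_dpbusd_epi32 helpers above perform the same per-lane operation: accumulate a four-way dot product of unsigned 8-bit by signed 8-bit values into a 32-bit lane, using a native VNNI vpdpbusd where available and the maddubs/madd pair otherwise. A scalar reference sketch, added here only to pin down that semantics (not part of the header above):

#include <cstdint>

// One 32-bit lane of the x86 add_dpbusd helpers: acc += sum of four u8 * s8 products.
// The non-VNNI emulation saturates the intermediate 16-bit pair sums (maddubs/paddsw),
// so it can differ from this exact reference when those sums overflow int16.
// The NEON variant instead uses signed widening multiplies (vmull_s8 / vmlal_s8).
void add_dpbusd_lane_reference(int32_t& acc, const uint8_t a[4], const int8_t b[4])
{
    for (int i = 0; i < 4; ++i)
        acc += int32_t(a[i]) * int32_t(b[i]);
}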

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -472,8 +472,6 @@ TBTables TBTables;
// If the corresponding file exists two new objects TBTable<WDL> and TBTable<DTZ>
// are created and added to the lists and hash table. Called at init time.
void TBTables::add(const std::vector<PieceType>& pieces) {
if (sizeof(char*) < 8 && pieces.size() >= 6)
return; // Not enough address space to support 6-men TB on 32-bit OS
std::string code;
@ -771,7 +769,7 @@ Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* resu
goto encode_remaining; // With pawns we have finished special treatments
}
// In positions withouth pawns, we further flip the squares to ensure leading
// In positions without pawns, we further flip the squares to ensure leading
// piece is below RANK_5.
if (rank_of(squares[0]) > RANK_4)
for (int i = 0; i < size; ++i)
@ -814,7 +812,7 @@ Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* resu
// Rs "together" in 62 * 61 / 2 ways (we divide by 2 because rooks can be
// swapped and still get the same position.)
//
// In case we have at least 3 unique pieces (inlcuded kings) we encode them
// In case we have at least 3 unique pieces (included kings) we encode them
// together.
if (entry->hasUniquePieces) {
@ -829,7 +827,7 @@ Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* resu
+ (squares[1] - adjust1)) * 62
+ squares[2] - adjust2;
// First piece is on a1-h8 diagonal, second below: map this occurence to
// First piece is on a1-h8 diagonal, second below: map this occurrence to
// 6 to differentiate from the above case, rank_of() maps a1-d4 diagonal
// to 0...3 and finally MapB1H1H7[] maps the b1-h1-h7 triangle to 0..27.
else if (off_A1H8(squares[1]))
@ -859,7 +857,7 @@ encode_remaining:
idx *= d->groupIdx[0];
Square* groupSq = squares + d->groupLen[0];
// Encode remainig pawns then pieces according to square, in ascending order
// Encode remaining pawns then pieces according to square, in ascending order
bool remainingPawns = entry->hasPawns && entry->pawnCount[1];
while (d->groupLen[++next])
@ -887,7 +885,7 @@ encode_remaining:
// Group together pieces that will be encoded together. The general rule is that
// a group contains pieces of same type and color. The exception is the leading
// group that, in case of positions withouth pawns, can be formed by 3 different
// group that, in case of positions without pawns, can be formed by 3 different
// pieces (default) or by the king pair when there is not a unique piece apart
// from the kings. When there are pawns, pawns are always first in pieces[].
//
@ -919,7 +917,7 @@ void set_groups(T& e, PairsData* d, int order[], File f) {
//
// This ensures unique encoding for the whole position. The order of the
// groups is a per-table parameter and could not follow the canonical leading
// pawns/pieces -> remainig pawns -> remaining pieces. In particular the
// pawns/pieces -> remaining pawns -> remaining pieces. In particular the
// first group is at order[0] position and the remaining pawns, when present,
// are at order[1] position.
bool pp = e.hasPawns && e.pawnCount[1]; // Pawns on both sides
@ -939,7 +937,7 @@ void set_groups(T& e, PairsData* d, int order[], File f) {
d->groupIdx[1] = idx;
idx *= Binomial[d->groupLen[1]][48 - d->groupLen[0]];
}
else // Remainig pieces
else // Remaining pieces
{
d->groupIdx[next] = idx;
idx *= Binomial[d->groupLen[next]][freeSquares];
@ -949,7 +947,7 @@ void set_groups(T& e, PairsData* d, int order[], File f) {
d->groupIdx[n] = idx;
}
// In Recursive Pairing each symbol represents a pair of childern symbols. So
// In Recursive Pairing each symbol represents a pair of children symbols. So
// read d->btree[] symbols data and expand each one in his left and right child
// symbol until reaching the leafs that represent the symbol value.
uint8_t set_symlen(PairsData* d, Sym s, std::vector<bool>& visited) {
@ -1319,7 +1317,7 @@ void Tablebases::init(const std::string& paths) {
for (auto p : bothOnDiagonal)
MapKK[p.first][p.second] = code++;
// Binomial[] stores the Binomial Coefficents using Pascal rule. There
// Binomial[] stores the Binomial Coefficients using Pascal rule. There
// are Binomial[k][n] ways to choose k elements from a set of n elements.
Binomial[0][0] = 1;
@ -1339,7 +1337,7 @@ void Tablebases::init(const std::string& paths) {
for (int leadPawnsCnt = 1; leadPawnsCnt <= 5; ++leadPawnsCnt)
for (File f = FILE_A; f <= FILE_D; ++f)
{
// Restart the index at every file because TB table is splitted
// Restart the index at every file because TB table is split
// by file, so we can reuse the same index for different files.
int idx = 0;
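The Binomial[] comment corrected above describes a table filled with Pascal's rule, C(n, k) = C(n-1, k-1) + C(n-1, k). A minimal standalone sketch of such a fill (the dimensions and function name here are illustrative, not the engine's own):

#include <cstdint>

uint64_t Binomial[7][64];   // Binomial[k][n]: ways to choose k elements from a set of n

void init_binomial()
{
    Binomial[0][0] = 1;
    for (int n = 1; n < 64; ++n)               // elements (squares)
        for (int k = 0; k < 7 && k <= n; ++k)  // chosen elements (pieces)
            Binomial[k][n] = (k > 0 ? Binomial[k - 1][n - 1] : 0)
                           + Binomial[k][n - 1];
}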

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -38,7 +38,7 @@ enum WDLScore {
// Possible states after a probing operation
enum ProbeState {
FAIL = 0, // Probe failed (missing file table)
OK = 1, // Probe succesful
OK = 1, // Probe successful
CHANGE_STM = -1, // DTZ should check the other side
ZEROING_BEST_MOVE = 2 // Best move zeroes DTZ (capture or pawn move)
};

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -59,7 +59,6 @@ void Thread::clear() {
counterMoves.fill(MOVE_NONE);
mainHistory.fill(0);
lowPlyHistory.fill(0);
captureHistory.fill(0);
for (bool inCheck : { false, true })
@ -67,7 +66,7 @@ void Thread::clear() {
{
for (auto& to : continuationHistory[inCheck][c])
for (auto& h : to)
h->fill(0);
h->fill(-71);
continuationHistory[inCheck][c][NO_PIECE][0]->fill(Search::CounterMovePruneThreshold - 1);
}
}
@ -162,6 +161,7 @@ void ThreadPool::clear() {
main()->callsCnt = 0;
main()->bestPreviousScore = VALUE_INFINITE;
main()->bestPreviousAverageScore = VALUE_INFINITE;
main()->previousTimeReduction = 1.0;
}

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -60,18 +60,19 @@ public:
Pawns::Table pawnsTable;
Material::Table materialTable;
size_t pvIdx, pvLast;
uint64_t ttHitAverage;
RunningAverage complexityAverage;
std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;
int selDepth, nmpMinPly;
Color nmpColor;
std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;
Value bestValue, optimism[COLOR_NB];
Position rootPos;
StateInfo rootState;
Search::RootMoves rootMoves;
Depth rootDepth, completedDepth;
Depth rootDepth, completedDepth, depth;
Value rootDelta;
CounterMoveHistory counterMoves;
ButterflyHistory mainHistory;
LowPlyHistory lowPlyHistory;
CapturePieceToHistory captureHistory;
ContinuationHistory continuationHistory[2][2];
Score trend;
@ -89,6 +90,7 @@ struct MainThread : public Thread {
double previousTimeReduction;
Value bestPreviousScore;
Value bestPreviousAverageScore;
Value iterValue[4];
int callsCnt;
bool stopOnPonderhit;

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -68,6 +68,9 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
TimePoint timeLeft = std::max(TimePoint(1),
limits.time[us] + limits.inc[us] * (mtg - 1) - moveOverhead * (2 + mtg));
// Use extra time with larger increments
double optExtra = std::clamp(1.0 + 12.0 * limits.inc[us] / limits.time[us], 1.0, 1.12);
// A user may scale time usage by setting UCI option "Slow Mover"
// Default is 100 and changing this value will probably lose elo.
timeLeft = slowMover * timeLeft / 100;
@ -78,15 +81,16 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
if (limits.movestogo == 0)
{
optScale = std::min(0.0084 + std::pow(ply + 3.0, 0.5) * 0.0042,
0.2 * limits.time[us] / double(timeLeft));
0.2 * limits.time[us] / double(timeLeft))
* optExtra;
maxScale = std::min(7.0, 4.0 + ply / 12.0);
}
// x moves in y seconds (+ z increment)
else
{
optScale = std::min((0.8 + ply / 128.0) / mtg,
0.8 * limits.time[us] / double(timeLeft));
optScale = std::min((0.88 + ply / 116.4) / mtg,
0.88 * limits.time[us] / double(timeLeft));
maxScale = std::min(6.3, 1.5 + 0.11 * mtg);
}
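The TimeManagement::init hunk above does two things: it introduces an optExtra bonus of up to 12% when the increment is large relative to the clock, and it retunes the moves-to-go branch from 0.8 and ply/128 to 0.88 and ply/116.4. A hedged sketch of just the sudden-death scale computation (the function name and the meaning of timeLeft are assumptions; the formulas themselves are taken from the hunk):

#include <algorithm>
#include <cmath>

// Illustrative only: optimum-time scale for the movestogo == 0 branch above.
// timeLeft is assumed to be the already-adjusted budget computed earlier in init().
double opt_scale_sudden_death(double myTime, double myInc, double timeLeft, int ply)
{
    double optExtra = std::clamp(1.0 + 12.0 * myInc / myTime, 1.0, 1.12);
    return std::min(0.0084 + std::pow(ply + 3.0, 0.5) * 0.0042,
                    0.2 * myTime / timeLeft)
         * optExtra;
}

For example, at 60s + 1s increment and ply 20, optExtra saturates at 1.12 and the ply term is about 0.0285, so when that term is the smaller one the engine budgets roughly 3.2% of timeLeft for the move.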

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -40,9 +40,9 @@ void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev)
move16 = (uint16_t)m;
// Overwrite less valuable entries (cheapest checks first)
if (b == BOUND_EXACT
if ( b == BOUND_EXACT
|| (uint16_t)k != key16
|| d - DEPTH_OFFSET > depth8 - 4)
|| d - DEPTH_OFFSET + 2 * pv > depth8 - 4)
{
assert(d > DEPTH_OFFSET);
assert(d < 256 + DEPTH_OFFSET);
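The TTEntry::save change above adds a two-ply depth credit for PV entries in the overwrite test. A standalone sketch of that decision in isolation (the helper name and the DEPTH_OFFSET value are assumptions; the condition itself is the one in the hunk):

#include <cstdint>

constexpr int DEPTH_OFFSET = -7;   // assumed value; the real constant is defined in tt.h

// Exact bounds and key mismatches always overwrite; otherwise the incoming depth must
// beat the stored depth minus a 4-ply grace, with +2 plies of credit for PV entries.
bool tt_should_overwrite(bool boundExact, bool sameKey, int newDepth, bool isPv, uint8_t storedDepth8)
{
    return boundExact
        || !sameKey
        || newDepth - DEPTH_OFFSET + 2 * isPv > storedDepth8 - 4;
}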

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -84,7 +84,7 @@ class Tune {
static Tune& instance() { static Tune t; return t; } // Singleton
// Use polymorphism to accomodate Entry of different types in the same vector
// Use polymorphism to accommodate Entry of different types in the same vector
struct EntryBase {
virtual ~EntryBase() = default;
virtual void init_option() = 0;

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -465,10 +465,6 @@ constexpr Move make_move(Square from, Square to) {
return Move((from << 6) + to);
}
constexpr Move reverse_move(Move m) {
return make_move(to_sq(m), from_sq(m));
}
template<MoveType T>
constexpr Move make(Square from, Square to, PieceType pt = KNIGHT) {
return Move(T + ((pt - KNIGHT) << 12) + (from << 6) + to);

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -207,8 +207,8 @@ namespace {
// Coefficients of a 3rd order polynomial fit based on fishtest data
// for two parameters needed to transform eval to the argument of a
// logistic function.
double as[] = {-3.68389304, 30.07065921, -60.52878723, 149.53378557};
double bs[] = {-2.0181857, 15.85685038, -29.83452023, 47.59078827};
double as[] = {-1.17202460e-01, 5.94729104e-01, 1.12065546e+01, 1.22606222e+02};
double bs[] = {-1.79066759, 11.30759193, -17.43677612, 36.47147479};
double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
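The retuned as[]/bs[] coefficients above feed the cubic fits for a and b; per the comment in the hunk, these then parameterize a logistic that converts an internal evaluation into a win probability. A hedged sketch of the full mapping (the logistic form and per-mille scaling are inferred from that comment, not shown in the hunk; the surrounding function may clamp or rescale v first):

#include <cmath>

// Illustrative only: v is an internal score (roughly centipawns), m the
// normalized material/ply parameter used by the surrounding code.
int win_rate_per_mille(double v, double m)
{
    const double as[] = {-1.17202460e-01, 5.94729104e-01, 1.12065546e+01, 1.22606222e+02};
    const double bs[] = {-1.79066759, 11.30759193, -17.43677612, 36.47147479};

    double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
    double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];

    return int(0.5 + 1000.0 / (1.0 + std::exp((a - v) / b)));
}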

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -1,6 +1,6 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -164,7 +164,7 @@ Option& Option::operator=(const string& v) {
assert(!type.empty());
if ( (type != "button" && v.empty())
if ( (type != "button" && type != "string" && v.empty())
|| (type == "check" && v != "true" && v != "false")
|| (type == "spin" && (stof(v) < min || stof(v) > max)))
return *this;

View File

@ -36,7 +36,7 @@ import org.petero.droidfish.EngineOptions;
/** Stockfish engine running as process, started from assets resource. */
public class InternalStockFish extends ExternalEngine {
private static final String defaultNet = "nn-3475407dc199.nnue";
private static final String defaultNet = "nn-6877cd24400e.nnue";
private static final String netOption = "evalfile";
private File defaultNetFile; // To get the full path of the copied default network file