From 1ca31a78fba99d1e1712119b8d2a1395dfa3492e Mon Sep 17 00:00:00 2001
From: Christoph Hasse <hasse.christoph@outlook.de>
Date: Mon, 6 Apr 2020 17:53:41 +0200
Subject: [PATCH 1/3] refactor SciFiTrackForwarding

---
 .../src/SciFiTrackForwarding.cpp              | 1925 +++++++----------
 1 file changed, 798 insertions(+), 1127 deletions(-)
diff --git a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
index cc618bab453..e8ee273a399 100644
--- a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
+++ b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
@@ -52,28 +52,51 @@
  *     doublets of hits are searched in the S2L0 --> S2L3 layers,
  *     they are then prolonged to S3 and S1.
  *
- *   Most important FIXMEs:
- * - Whenever a lighter eventmodel is used, it will be a good idea to vectorize the linear and binary search for further
- *   speedups
  *
- * - The currently employed linear fit leaves a couple things to be desired. Thus this should at some point be replaced
- *   by a better solution. Mainly we don't get a real y-slope or offset since we only fit x-hits, but it's resolution
- *   for high momentum tracks could be better which leads to a momentum resolution of up to ~1.8% while something closer
- *   to ~1% should be possible
+ * FIXME
+ * - The currently employed linear fit only uses x-hits.
+ *   This leads to incresing momentum resolution for higher momentum tracks.
  */
 
 ///////////////////////////////////////////////////////////////////////////////
 // anonymous namespace for helper functions local to this compilation unit
 ///////////////////////////////////////////////////////////////////////////////
 namespace {
+
+  using scalar = SIMDWrapper::scalar::types;
+  using sF     = scalar::float_v;
+  using sI     = scalar::int_v;
+
+  using vec = SIMDWrapper::avx2::types;
+  using vF  = vec::float_v;
+  using vI  = vec::int_v;
+
+  // stolen from Niklas
+  // c++20's remove_cvref
+  template <typename T>
+  struct remove_cvref {
+    using type = std::remove_cv_t<std::remove_reference_t<T>>;
+  };
+
+  template <typename T>
+  using remove_cvref_t = typename remove_cvref<T>::type;
+
+  template <typename T>
+  auto to_std_array( T&& some_v ) {
+    if constexpr ( std::is_same_v<remove_cvref_t<T>, vF> ) {
+      std::array<float, vec::size> tmp;
+      some_v.store( tmp.data() );
+      return tmp;
+    } else if ( std::is_same_v<remove_cvref_t<T>, vI> ) {
+      std::array<int, vec::size> tmp;
+      some_v.store( tmp.data() );
+      return tmp;
+    }
+  }
+
   using TracksUT = LHCb::Pr::Upstream::Tracks;
   using TracksFT = LHCb::Pr::Forward::Tracks;
 
-  // average z-position of the kink position inside the magnet (tuned on MC),
-  // for S3 and S2
-  float constexpr zPosKinkMagnet_S3 = 5282.5f;
-  float constexpr zPosKinkMagnet_S2 = 5195.f;
-
   // constants for extrapolation polynomials from x hit in S3L0
   // to the corresponding x hit in other stations and layers
   double constexpr ExtFacS3_alpha = 1.4706824654297932e-5;
@@ -90,11 +113,8 @@ namespace {
     std::array<float, 24> dZdYFiber{std::numeric_limits<float>::signaling_NaN()};
     std::array<float, 12> dXdYFiber{std::numeric_limits<float>::signaling_NaN()};
 
-    // polynomial coefficients for the seeding starting from S3
-    std::array<float, 24> ExtFacS3{std::numeric_limits<float>::signaling_NaN()};
-
-    // polynomial coefficients for the seeding starting from S2
-    std::array<float, 24> ExtFacS2{std::numeric_limits<float>::signaling_NaN()};
+    // polynomial coefficients for extrapolations
+    std::array<float, 48> ext_coeff{std::numeric_limits<float>::signaling_NaN()};
 
     PrFTZoneHandler zoneHandler;
 
@@ -113,41 +133,40 @@ namespace {
       // get the uv layer fiber slopes to cache them for later usage
       for ( int i{0}; i < 6; ++i ) {
         // the first 6 values are the bottom uv layers
-        dXdYFiber[i] = zoneHandler.zone( PrFTInfo::stereoZones[2 * i] ).dxDy();
+        dXdYFiber[i] = -zoneHandler.zone( PrFTInfo::stereoZones[2 * i] ).dxDy();
 
         // the last 6 values are the top uv layers
-        dXdYFiber[6 + i] = zoneHandler.zone( PrFTInfo::stereoZones[2 * i + 1] ).dxDy();
+        dXdYFiber[6 + i] = -zoneHandler.zone( PrFTInfo::stereoZones[2 * i + 1] ).dxDy();
       }
 
       // calculate some coefficients for the extrapolation into other stations and put them in a cache
       int cnt = 0;
 
-      // used in the second iteration, seeding from S2
-      for ( int i : {0, 3, 8, 11} ) {
-        int const from = i < 4 ? 4 : 7;
-
-        ExtFacS2[cnt]        = ExtFacS2_alpha * std::pow( LayerZPos[i] - LayerZPos[from], 2 );
-        ExtFacS2[12 + cnt++] = ExtFacS2_alpha * std::pow( LayerZPos[12 + i] - LayerZPos[from + 12], 2 );
+      // used in the first iteration, seeding from S3
+      for ( int i : {0, 3, 4, 7} ) {
+        ext_coeff[cnt]        = ExtFacS3_alpha * std::pow( LayerZPos[i] - LayerZPos[8], 2 );
+        ext_coeff[12 + cnt++] = ExtFacS3_alpha * std::pow( LayerZPos[12 + i] - LayerZPos[20], 2 );
 
-        ExtFacS2[cnt]        = ExtFacS2_beta * std::pow( LayerZPos[i] - LayerZPos[from], 3 );
-        ExtFacS2[12 + cnt++] = ExtFacS2_beta * std::pow( LayerZPos[12 + i] - LayerZPos[from + 12], 3 );
+        ext_coeff[cnt]        = ExtFacS3_beta * std::pow( LayerZPos[i] - LayerZPos[8], 3 );
+        ext_coeff[12 + cnt++] = ExtFacS3_beta * std::pow( LayerZPos[12 + i] - LayerZPos[20], 3 );
 
-        ExtFacS2[cnt]        = ExtFacS2_gamma * std::pow( LayerZPos[i] - LayerZPos[from], 2 );
-        ExtFacS2[12 + cnt++] = ExtFacS2_gamma * std::pow( LayerZPos[12 + i] - LayerZPos[from + 12], 2 );
+        ext_coeff[cnt]        = ExtFacS3_gamma * std::pow( LayerZPos[i] - LayerZPos[8], 2 );
+        ext_coeff[12 + cnt++] = ExtFacS3_gamma * std::pow( LayerZPos[12 + i] - LayerZPos[20], 2 );
       }
 
-      cnt = 0;
+      cnt = 24;
+      // used in the second iteration, seeding from S2
+      for ( int i : {0, 3, 8, 11} ) {
+        int const from = i < 4 ? 4 : 7;
 
-      // used in the first iteration, seeding from S3
-      for ( int i : {0, 3, 4, 7} ) {
-        ExtFacS3[cnt]        = ExtFacS3_alpha * std::pow( LayerZPos[i] - LayerZPos[8], 2 );
-        ExtFacS3[12 + cnt++] = ExtFacS3_alpha * std::pow( LayerZPos[12 + i] - LayerZPos[20], 2 );
+        ext_coeff[cnt]        = ExtFacS2_alpha * std::pow( LayerZPos[i] - LayerZPos[from], 2 );
+        ext_coeff[12 + cnt++] = ExtFacS2_alpha * std::pow( LayerZPos[12 + i] - LayerZPos[from + 12], 2 );
 
-        ExtFacS3[cnt]        = ExtFacS3_beta * std::pow( LayerZPos[i] - LayerZPos[8], 3 );
-        ExtFacS3[12 + cnt++] = ExtFacS3_beta * std::pow( LayerZPos[12 + i] - LayerZPos[20], 3 );
+        ext_coeff[cnt]        = ExtFacS2_beta * std::pow( LayerZPos[i] - LayerZPos[from], 3 );
+        ext_coeff[12 + cnt++] = ExtFacS2_beta * std::pow( LayerZPos[12 + i] - LayerZPos[from + 12], 3 );
 
-        ExtFacS3[cnt]        = ExtFacS3_gamma * std::pow( LayerZPos[i] - LayerZPos[8], 2 );
-        ExtFacS3[12 + cnt++] = ExtFacS3_gamma * std::pow( LayerZPos[12 + i] - LayerZPos[20], 2 );
+        ext_coeff[cnt]        = ExtFacS2_gamma * std::pow( LayerZPos[i] - LayerZPos[from], 2 );
+        ext_coeff[12 + cnt++] = ExtFacS2_gamma * std::pow( LayerZPos[12 + i] - LayerZPos[from + 12], 2 );
       }
     }
   };
@@ -197,89 +216,60 @@ namespace {
     int m_size;
   };
 
-  // fast_upper_bound from Arthur
+  // modified fast_upper_bound from Arthur
   template <class T>
-  int fast_upper_bound( const span<const T>& vec, const T value ) {
-    int size = vec.size();
+  int get_closest_hit_idx_binary( span<const T> const& hitvec, T const value ) {
+    int size = hitvec.size();
     int low  = 0;
 
     int half1 = size / 2;
-    low += ( vec[low + half1] <= value ) * ( size - half1 );
+    low += ( hitvec[low + half1] <= value ) * ( size - half1 );
     size      = half1;
     int half2 = size / 2;
-    low += ( vec[low + half2] <= value ) * ( size - half2 );
+    low += ( hitvec[low + half2] <= value ) * ( size - half2 );
     size      = half2;
     int half3 = size / 2;
-    low += ( vec[low + half3] <= value ) * ( size - half3 );
+    low += ( hitvec[low + half3] <= value ) * ( size - half3 );
     size      = half3;
     int half4 = size / 2;
-    low += ( vec[low + half4] <= value ) * ( size - half4 );
+    low += ( hitvec[low + half4] <= value ) * ( size - half4 );
     size      = half4;
     int half5 = size / 2;
-    low += ( vec[low + half5] <= value ) * ( size - half5 );
+    low += ( hitvec[low + half5] <= value ) * ( size - half5 );
     size = half5;
 
     do {
       int half = size / 2;
-      low += ( vec[low + half] <= value ) * ( size - half );
+      low += ( hitvec[low + half] <= value ) * ( size - half );
       size = half;
     } while ( size > 0 );
 
-    return low;
-  }
-
-  [[using gnu: const, hot]] int getUpperBoundBinary( SciFiTrackForwardingHits::hits_t const& vec, int const start,
-                                                     int const end, float const val ) noexcept {
-    return start + fast_upper_bound( {vec.data() + start, end - start}, val );
+    return low - ( std::abs( hitvec[low] - value ) >= std::abs( hitvec[low - 1] - value ) );
   }
 
   // Simple linear search
   // since my vector container is padded by sentinels I can get rid of a check in the loop and simply advance until
   // condition is met
-  [[using gnu: const, hot]] int getUpperBoundLinear( SciFiTrackForwardingHits::hits_t const& vec, int start,
-                                                     float const val ) noexcept {
-
-    using simd = SIMDWrapper::avx2::types;
-    using F    = simd::float_v;
+  [[using gnu: hot, const]] inline int get_closest_hit_idx( SciFiTrackForwardingHits::hits_t const& hitvec, int start,
+                                                            float const val ) noexcept {
 
     // vector of comparison value
-    F vval{val};
+    vF vval{val};
     for ( ;; ) {
       // load vector of data
-      F vvec{vec.data() + start};
+      vF vvec{hitvec.data() + start};
       // comparison mask with true/false values
       auto const mask = vval > vvec;
-      // sum of true values
-      auto const sum{popcount( mask )};
+
       // if less than simd::size whe can stop
-      if ( sum < static_cast<int>( simd::size ) ) return start + sum;
+      if ( !all( mask ) ) {
+        start += popcount( mask );
+        break;
+      }
       // all values where smaller thus do next iteration
-      start += simd::size;
+      start += vec::size;
     }
-  }
-
-  // shared implementation between linear and binary hit search
-  [[using gnu: const, hot]] std::tuple<int, float>
-  __Impl_SearchLayerForHit( SciFiTrackForwardingHits::hits_t const& hits, int const upperIdx, float const prediction ) {
-    bool dIdx = std::abs( hits[upperIdx] - prediction ) >= std::abs( hits[upperIdx - 1] - prediction );
-    return {upperIdx - dIdx, std::abs( prediction - hits[upperIdx - dIdx] )};
-  }
-
-  // given the hits of a station are in the vector hits+StartOffset ---- hits+EndOffset
-  // this will find the closest x-hit to the x-prediction using binary search
-  [[using gnu: const, hot]] std::tuple<int, float>
-  SearchLayerForHitBinary( SciFiTrackForwardingHits::hits_t const& hits, int const StartOffset, int const EndOffset,
-                           float const prediction ) {
-    int const upperIdx = getUpperBoundBinary( hits, StartOffset, EndOffset, prediction );
-    return __Impl_SearchLayerForHit( hits, upperIdx, prediction );
-  }
-
-  // same as above, although we use a linear search, EndOffset is not needed as the sentinel value ends the range
-  [[using gnu: const, hot]] std::tuple<int, float>
-  SearchLayerForHitLinear( SciFiTrackForwardingHits::hits_t const& hits, int const StartOffset,
-                           float const prediction ) {
-    int const upperIdx = getUpperBoundLinear( hits, StartOffset, prediction );
-    return __Impl_SearchLayerForHit( hits, upperIdx, prediction );
+    return start - ( hitvec[start] - val >= val - hitvec[start - 1] );
   }
 
   // helper function to fill a vector with some information later used in the linear fit
@@ -292,6 +282,176 @@ namespace {
     fitvec[fitcnt++] = residual;
   }
 
+  std::tuple<float, float, float> simple_fit( std::array<float, 24> const& fitvec, int const fitcnt, float const newtx,
+                                              float const firstXhit ) {
+    // quick linear fit in x0 and tx, curvature terms remain fixed
+    float s0  = 0;
+    float sz  = 0;
+    float sz2 = 0;
+    float sd  = 0;
+    float sdz = 0;
+
+    for ( int idx{0}; idx < fitcnt; idx += 4 ) {
+      // for now the weight is just 1
+      s0 += 1;
+      sz += fitvec[idx + 1];
+      sz2 += fitvec[idx + 1] * fitvec[idx + 1];
+      sd += fitvec[idx + 3];
+      sdz += fitvec[idx + 3] * fitvec[idx + 1];
+    }
+
+    float const den     = 1.f / ( sz * sz - 6 * sz2 );
+    float const xoff    = ( sdz * sz - sd * sz2 ) * den;
+    float const txoff   = ( sd * sz - 6 * sdz ) * den;
+    float const finaltx = newtx + txoff;
+    float const newx0   = firstXhit + xoff;
+
+    // calculate a chi2 after the fit
+    float chi2 = ( fitvec[0] - newx0 ) * ( fitvec[0] - newx0 );
+    for ( int idx{4}; idx < fitcnt; idx += 4 ) {
+      // for now the weight is just 1
+      float const diff = fitvec[idx] - ( newx0 + fitvec[idx + 1] * finaltx + fitvec[idx + 2] );
+      chi2 += diff * diff;
+    }
+    return {finaltx, newx0, chi2};
+  }
+
+  template <bool is_first_loop>
+  float calc_mom( float const finaltx, float const endv_tx, float const endv_ty2, float const mag_scale,
+                  float const factor ) {
+
+    // determine momentum * charge estimate, polynomial was fitted on MC
+    const float finaltx2 = finaltx * finaltx;
+    const float coef =
+        ( MomentumParams[0] + finaltx2 * ( MomentumParams[1] + MomentumParams[2] * finaltx2 ) +
+          MomentumParams[3] * finaltx * endv_tx + endv_ty2 * ( MomentumParams[4] + MomentumParams[5] * endv_ty2 ) +
+          MomentumParams[6] * endv_tx * endv_tx );
+
+    if constexpr ( is_first_loop ) {
+      return mag_scale * coef / ( finaltx - endv_tx ) + factor * MomentumParams[7];
+    } else {
+      return 0.9818f * ( mag_scale * coef / ( finaltx - endv_tx ) + factor * MomentumParams[7] );
+    }
+  }
+
+  float calc_quality( float const endv_pt, float const factor, float const firstXhit, float const straighExt,
+                      float const endv_pq, float const endv_qp, float const candPQ, float const wrong_sign_pt,
+                      float const chi2, float const endv_txy, float const S3UVDelta, float const S2UVDelta,
+                      float const S1UVDelta, int const numuvhits, int const numxhits ) {
+
+    // the condition evaluates to true if the track is a charge flipped one
+    float deltaMom = ( endv_pt > wrong_sign_pt and ( factor * ( firstXhit - straighExt ) < 0 ) )
+                         ? 3.f * std::abs( endv_qp * ( endv_pq + candPQ ) )
+                         : std::abs( endv_qp * ( endv_pq - candPQ ) );
+
+    // counteract worse momentum resolution at higher momenutm by scaling down
+    // the observed delta
+    if ( std::abs( candPQ ) > 40000 ) deltaMom *= 40000.f / std::abs( candPQ );
+
+    // For now a linear discriminant seems to do well enough. But if needed this
+    // could be substituted by a neural net for better rejection of ghosts
+    return LDAParams[0] * approx_log( sF( 1.f + deltaMom ) ).cast() +
+           LDAParams[1] * approx_log( sF( 1.f + chi2 ) ).cast() +
+           LDAParams[2] *
+               approx_log( sF( std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy * endv_txy ) ) ).cast() +
+           LDAParams[3] * approx_log( sF( S3UVDelta ) ).cast() + LDAParams[4] * approx_log( sF( S2UVDelta ) ).cast() +
+           LDAParams[5] * approx_log( sF( S1UVDelta ) ).cast() + LDAParams[6] * static_cast<float>( numuvhits ) +
+           LDAParams[7] * static_cast<float>( numxhits );
+  }
+
+  struct config_loop_1 {
+    // average z-position of the kink position inside the magnet (tuned on MC),
+    float static constexpr zPosKinkMagnet = 5282.5f;
+    // idx of first layer of seed station in Z-position array
+    int static constexpr seed_first_layer_z_idx{8};
+    int static constexpr next_first_layer_z_idx{4};
+    int static constexpr last_first_layer_z_idx{0};
+    // the array of extrapolation coefficients is 48 entries long
+    // first 24 are for the first loop 24-47 for the second loop
+    int static constexpr ext_coeff_start{0};
+    // Config 1 starts seeding in last station
+    // indices to select layers of third station
+    int static constexpr seed_station_idx_first{4};
+    int static constexpr seed_station_idx_last{5};
+    // station 2 is the first station to be extrapolated into
+    int static constexpr next_station_idx_first{2};
+    int static constexpr next_station_idx_last{3};
+    float static constexpr doublet_curve_c0{-0.231615f};
+    float static constexpr doublet_curve_c1{33.1256f};
+    float static constexpr doublet_curve_c2{10.4693f};
+    float static constexpr seed_scale_ycorr{1.005f};
+    float static constexpr next_scale_ycorr{0.83f};
+    float static constexpr last_scale_ycorr{0.615f};
+    // cut window parameters
+    float static constexpr doublet_win_slope = 4.25f;
+    float static constexpr doublet_win_off   = 0.48f;
+    float static constexpr doublet_win_max   = 3.f;
+    float static constexpr seed_uv_win_slope = 10.f;
+    float static constexpr seed_uv_win_off   = .75f;
+
+    float static constexpr next_l0_win_slope = 2.f;
+    float static constexpr next_l0_win_off   = 2.f;
+    float static constexpr next_l3_win_slope = 1.f;
+    float static constexpr next_l3_win_off   = 1.8f;
+    float static constexpr next_uv_win_slope = 10.f;
+    float static constexpr next_uv_win_off   = .75f;
+
+    float static constexpr last_l0_win_slope = 2.6f;
+    float static constexpr last_l0_win_off   = 3.6f;
+    float static constexpr last_l3_win_slope = 2.6f;
+    float static constexpr last_l3_win_off   = 3.5f;
+    float static constexpr last_uv_win_slope = 7.f;
+    float static constexpr last_uv_win_off   = 1.f;
+
+    int static constexpr last_station_idx_first{0};
+    int static constexpr last_station_idx_last{1};
+  };
+
+  struct config_loop_2 {
+    float static constexpr zPosKinkMagnet = 5195.f;
+    int static constexpr seed_first_layer_z_idx{4};
+    int static constexpr next_first_layer_z_idx{8};
+    int static constexpr last_first_layer_z_idx{0};
+    // the array of extrapolation coefficients is 48 entries long
+    // first 24 are for the first loop 24-47 for the second loop
+    int static constexpr ext_coeff_start{24};
+    // Config 1 starts seeding in last station
+    // indices to select layers of third station
+    int static constexpr seed_station_idx_first{2};
+    int static constexpr seed_station_idx_last{3};
+    // station 2 is the first station to be extrapolated into
+    int static constexpr next_station_idx_first{4};
+    int static constexpr next_station_idx_last{5};
+    float static constexpr doublet_curve_c0{-1.050f};
+    float static constexpr doublet_curve_c1{17.74f};
+    float static constexpr doublet_curve_c2{20.62};
+    float static constexpr seed_scale_ycorr{0.83f};
+    float static constexpr next_scale_ycorr{1.005f};
+    float static constexpr last_scale_ycorr{0.615f};
+    // cut window parameters
+    float static constexpr doublet_win_slope = 4.3f;
+    float static constexpr doublet_win_off   = 0.6f;
+    float static constexpr doublet_win_max   = 3.2f;
+    float static constexpr seed_uv_win_slope = 10.f;
+    float static constexpr seed_uv_win_off   = 1.f;
+
+    float static constexpr next_l0_win_slope = 1.5f;
+    float static constexpr next_l0_win_off   = 2.3f;
+    float static constexpr next_l3_win_slope = 1.5f;
+    float static constexpr next_l3_win_off   = 1.5f;
+    float static constexpr next_uv_win_slope = 10.f;
+    float static constexpr next_uv_win_off   = 1.f;
+
+    float static constexpr last_l0_win_slope = 2.5f;
+    float static constexpr last_l0_win_off   = 2.f;
+    float static constexpr last_l3_win_slope = 1.5f;
+    float static constexpr last_l3_win_off   = 2.3f;
+    float static constexpr last_uv_win_slope = 7.5f;
+    float static constexpr last_uv_win_off   = 1.5f;
+    int static constexpr last_station_idx_first{0};
+    int static constexpr last_station_idx_last{1};
+  };
+
 } // namespace
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -334,32 +494,15 @@ public:
   TracksFT operator()( EventContext const&, SciFiTrackForwardingHits const&, TracksUT const&,
                        GeomCache const& ) const override;
 
-  mutable Gaudi::Accumulators::SummingCounter<unsigned> m_CounterOutput{this, "Created long tracks"};
-
-  mutable Gaudi::Accumulators::Counter<> m_CounterAccepted{this, "Accepted input tracks"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterS1{this, "Search S1"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterS1UV{this, "Search S1UV"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterS2{this, "Search S2"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterS2UV{this, "Search S2 UV"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterS3{this, "Search S3"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterS3UV{this, "Search S3UV"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterSecondLoop{this, "1-  Second loops"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterXDoubletsS3{this, "2-  X doublets in S3"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterXDoubletsS2_2ndloop{this, "2b- X doublets in S2, 2nd loop"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterXDoubletsS3S2{this, "3-  X doublets in S3 and S2"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterXDoubletsS3S2_2ndloop{this, "3b- X doublets in S3 and S2, 2nd loop"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterXDoubletsS3S2S1{this, "4-  X doublets in S3, S2 and S1"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterXDoubletsS3S2S1_2ndloop{this,
-                                                                          "4b- X doublets in S3, S2 and S1, 2nd loop"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterCandsBeforeQuality{this, "5-  Candidates before the quality cut"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterCandsBeforeQuality_2ndloop{
-      this, "5b- Candidates before the quality cut, 2nd loop"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterCandsAfterQuality{this, "6-  Candidates after the quality cut"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterCandsAfterQuality_2ndloop{
-      this, "6b- Candidates after the quality cut, 2nd loop"};
-  mutable Gaudi::Accumulators::Counter<> m_CounterCands{this, "7-  Candidates"};
-
-  Gaudi::Property<float> p_preSelPT{this, "PreSelectionPT", 400.0f};
+  template <typename config>
+  [[using gnu: const, hot]] SciFiTrackForwardingCand const
+  find_track( GeomCache const&, SciFiTrackForwardingHits const&, int const, Vec3<SIMDWrapper::scalar::float_v> const&,
+              Vec3<SIMDWrapper::scalar::float_v> const&, float const, float const, float const, float const,
+              float const, float const, std::array<float, 12> const& ) const;
+
+  mutable Gaudi::Accumulators::SummingCounter<unsigned> m_counter_created_tracks{this, "Created long tracks"};
+  mutable Gaudi::Accumulators::Counter<>                m_counter_2_loop{this, "2nd Loop"};
+
   Gaudi::Property<float> p_minPt{this, "MinPt", 490.f};
 
   // if  a track exhibits a higher PT we open a symmetric search window
@@ -377,100 +520,14 @@ public:
   Gaudi::Property<float> p_LowerLimitSlope{this, "LowerLimit_slope", 1400.f};
   Gaudi::Property<float> p_LowerLimitMax{this, "LowerLimit_max", 600.f};
 
-  // station 2 x-doublet search window parameters
-  Gaudi::Property<float> p_DW_m{this, "DoubletWindow_slope", 4.25f};
-  Gaudi::Property<float> p_DW_b{this, "DoubletWindow_offset", 0.48f};
-  Gaudi::Property<float> p_DW_max{this, "DoubletWindow_max", 3.f};
-
-  // UV window station 3
-  Gaudi::Property<float> p_UV3W_m{this, "UV3Window_slope", 10.0f};
-  Gaudi::Property<float> p_UV3W_b{this, "UV3Window_offset", .75f};
-
-  // UV window station 2
-  Gaudi::Property<float> p_UV2W_m{this, "UV2Window_slope", 10.0f};
-  Gaudi::Property<float> p_UV2W_b{this, "UV2Window_offset", .75f};
-
-  // UV window station 1
-  Gaudi::Property<float> p_UV1W_m{this, "UV1Window_slope", 7.0f};
-  Gaudi::Property<float> p_UV1W_b{this, "UV1Window_offset", 1.f};
-
-  // S2L0 extrapolation window
-  Gaudi::Property<float> p_S2L0W_m{this, "S2L0Window_slope", 2.0f};
-  Gaudi::Property<float> p_S2L0W_b{this, "S2L0Window_offset", 2.0f};
-
-  // S2L3 extrapolation window
-  Gaudi::Property<float> p_S2L3W_m{this, "S2L3Window_slope", 1.0f};
-  Gaudi::Property<float> p_S2L3W_b{this, "S2L3Window_offset", 1.8f};
-
-  // S1L0 extrapolation window
-  Gaudi::Property<float> p_S1L0W_m{this, "S1L0Window_slope", 2.6f};
-  Gaudi::Property<float> p_S1L0W_b{this, "S1L0Window_offset", 3.6f};
-
-  // S1L3 extrapolation window
-  Gaudi::Property<float> p_S1L3W_m{this, "S1L3Window_slope", 2.6f};
-  Gaudi::Property<float> p_S1L3W_b{this, "S1L3Window_offset", 3.5f};
-
   // second loop for searching doublets
   Gaudi::Property<bool> p_SecondLoop{this, "SecondLoop", true};
 
-  // parameters for the momentum dependent upper error limit on the x-search window estimation
-  // Very large values of these parameters (x5-x6 wrt the values of the first iteration)
-  // do increase the efficiency for high momentum tracks of 0.5%-0.8% (with a +0.5% ghost rate price),
-  // but also slow down considerably the overall execution time by a 3x-4x factor
-  Gaudi::Property<float> p_UpperLimitOffset_2ndloop{this, "UpperLimit_offset_2ndloop", 100.f};
-  Gaudi::Property<float> p_UpperLimitSlope_2ndloop{this, "UpperLimit_slope_2ndloop", 2800.f};
-  Gaudi::Property<float> p_UpperLimitMax_2ndloop{this, "UpperLimit_max_2ndloop", 600.f};
-  Gaudi::Property<float> p_UpperLimitMin_2ndloop{this, "UpperLimit_min_2ndloop", 150.f};
-
-  // same as above for the lower limit
-  Gaudi::Property<float> p_LowerLimitOffset_2ndloop{this, "LowerLimit_offset_2ndloop", 50.f};
-  Gaudi::Property<float> p_LowerLimitSlope_2ndloop{this, "LowerLimit_slope_2ndloop", 1400.f};
-  Gaudi::Property<float> p_LowerLimitMax_2ndloop{this, "LowerLimit_max_2ndloop", 600.f};
-
-  // second loop: S2L3 extrapolation window
-  Gaudi::Property<float> p_S2L3W_m_2ndloop{this, "S2L3Window_slope_2ndloop", 4.3f};
-  Gaudi::Property<float> p_S2L3W_b_2ndloop{this, "S2L3Window_offset_2ndloop", 0.6f};
-  Gaudi::Property<float> p_DW_max_2ndloop{this, "DoubletWindow_max_2ndloop", 3.2f};
-
-  // second loop: UV window station 2
-  Gaudi::Property<float> p_UV2W_m_2ndloop{this, "UV2Window_slope_2ndloop", 10.f};
-  Gaudi::Property<float> p_UV2W_b_2ndloop{this, "UV2Window_offset_2ndloop", 1.f};
-
-  // second loop: S3L0 extrapolation window
-  Gaudi::Property<float> p_S3L0W_m_2ndloop{this, "S3L0Window_slope", 1.5f};
-  Gaudi::Property<float> p_S3L0W_b_2ndloop{this, "S3L0Window_offset", 2.3f};
-
-  // second loop: S3L3 extrapolation window
-  Gaudi::Property<float> p_S3L3W_m_2ndloop{this, "S3L3Window_slope", 1.5f};
-  Gaudi::Property<float> p_S3L3W_b_2ndloop{this, "S3L3Window_offset", 1.5f};
-
-  // second loop: UV window station 3
-  Gaudi::Property<float> p_UV3W_m_2ndloop{this, "UV3Window_slope_2ndloop", 10.f};
-  Gaudi::Property<float> p_UV3W_b_2ndloop{this, "UV3Window_offset_2ndloop", 1.f};
-
-  // second loop: S1L0 extrapolation window
-  Gaudi::Property<float> p_S1L0W_m_2ndloop{this, "S1L0Window_slope_2ndloop", 2.5f};
-  Gaudi::Property<float> p_S1L0W_b_2ndloop{this, "S1L0Window_offset_2ndloop", 2.f};
-
-  // second loop: S1L3 extrapolation window
-  Gaudi::Property<float> p_S1L3W_m_2ndloop{this, "S1L3Window_slope_2ndloop", 1.5f};
-  Gaudi::Property<float> p_S1L3W_b_2ndloop{this, "S1L3Window_offset_2ndloop", 2.3f};
-
-  // second loop: UV window station 1
-  Gaudi::Property<float> p_UV1W_m_2ndloop{this, "UV1Window_slope_2ndloop", 7.5f};
-  Gaudi::Property<float> p_UV1W_b_2ndloop{this, "UV1Window_offset_2ndloop", 1.5f};
-
-  // second loop: threshold of number of hits in S3 to reject a candidate
-  Gaudi::Property<int> p_numHitsThrS2_2ndloop{this, "numHitsThrS2_2ndloop", 4};
-
-  // second loop: threshold of number of x-hits in S1 and S2 to reject a candidate
-  // it is put in OR with the (numuvhits < 3) condition
-  Gaudi::Property<int> p_numXHitsThrS1S2_2ndloop{this, "numXHitsThrS1S2_2ndloop", 4};
-
 private:
   // this enables me to print things automagically space separated (less typing)
-  // also when enable is set to false, all of this code is removed which is nice for me to test what this actually costs
-  template <bool enable = true, typename... Args>
+  // also when enable is set to false, all of this code is removed which is nice for me to test what this actually
+  // costs
+  template <bool enable = false, typename... Args>
   void mydebug( Args&&... args ) const {
     if constexpr ( enable ) {
       if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) { ( ( debug() << std::forward<Args>( args ) << " " ), ... ) << endmsg; }
@@ -490,1047 +547,661 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac
 
   mydebug( "LayerZPos", cache.LayerZPos );
 
-  // get buffers for the counters
-  auto S1buffer                          = m_CounterS1.buffer();
-  auto S2buffer                          = m_CounterS2.buffer();
-  auto S3buffer                          = m_CounterS3.buffer();
-  auto S3UVbuffer                        = m_CounterS3UV.buffer();
-  auto S2UVbuffer                        = m_CounterS2UV.buffer();
-  auto S1UVbuffer                        = m_CounterS1UV.buffer();
-  auto CounterAcceptedBuffer             = m_CounterAccepted.buffer();
-  auto CounterSecondLoopBuffer           = m_CounterSecondLoop.buffer();
-  auto CounterCandsBuffer                = m_CounterCands.buffer();
-  auto CounterCandsBeforeQuality         = m_CounterCandsBeforeQuality.buffer();
-  auto CounterCandsBeforeQuality_2ndloop = m_CounterCandsBeforeQuality_2ndloop.buffer();
-  auto CounterCandsAfterQuality          = m_CounterCandsAfterQuality.buffer();
-  auto CounterCandsAfterQuality_2ndloop  = m_CounterCandsAfterQuality_2ndloop.buffer();
-  auto CounterXDoubletsS3                = m_CounterXDoubletsS3.buffer();
-  auto CounterXDoubletsS2_2ndloop        = m_CounterXDoubletsS2_2ndloop.buffer();
-  auto CounterXDoubletsS3S2              = m_CounterXDoubletsS3S2.buffer();
-  auto CounterXDoubletsS3S2_2ndloop      = m_CounterXDoubletsS3S2_2ndloop.buffer();
-  auto CounterXDoubletsS3S2S1            = m_CounterXDoubletsS3S2S1.buffer();
-  auto CounterXDoubletsS3S2S1_2ndloop    = m_CounterXDoubletsS3S2S1_2ndloop.buffer();
-
   // I want this in GeV but default in LHCb is MeV so this is a compromise to have the property in MeV
   float const minPTCutValue = p_minPt / float{Gaudi::Units::GeV};
 
-  using dType = SIMDWrapper::scalar::types;
-  using F     = dType::float_v;
-  using I     = dType::int_v;
-
-  // number of iterations for searching doublets
-  // First iteration, performed over all VeloUT reconstructed tracks:
-  //     doublets of hits are searched in the S3L0 --> S3L3 layers,
-  //     they are then prolonged to S2 and S1
-  // Second iteration, performed over the VeloUT tracks
-  // which have not been prolonged to the SciFi in the first iteration:
-  //     doublets of hits are searched in the S2L0 --> S2L3 layers,
-  //     they are then prolonged to S3 and S1
-  const unsigned int num_iterations = p_SecondLoop ? 2 : 1;
-
+  auto buffer_2_loop = m_counter_2_loop.buffer();
   // start our loop over UT tracks
-  // for ( auto const& uttrack : tracks ) {
-  for ( int uttrack = 0; uttrack < tracks.size(); uttrack += dType::size ) {
+  //
+  auto const vec_mag_scale     = vF{m_magscalefactor};
+  auto const neg_vec_mag_scale = vF{-1.f * m_magscalefactor};
+  for ( int uttrack = 0; uttrack < tracks.size(); uttrack += vec::size ) {
 
+    auto loop_mask = vec::loop_mask( uttrack, tracks.size() );
     // placeholder for the best candidate
-    SciFiTrackForwardingCand bestcandidate{};
     mydebug( "++++++++++ Start new UT track ++++++++++++++++" );
 
-    Vec3<F> const endv_pos = tracks.statePos<F>( uttrack );
-    Vec3<F> const endv_dir = tracks.stateDir<F>( uttrack );
+    // Vec3<sF> const& endv_pos = tracks.statePos<sF>( uttrack );
+    // Vec3<sF> const& endv_dir = tracks.stateDir<sF>( uttrack );
+    auto const& endv_pos = tracks.statePos<vF>( uttrack );
+    auto const& endv_dir = tracks.stateDir<vF>( uttrack );
 
     // everything I need from the end velo state
-    float const endv_x    = endv_pos.x.cast();
-    float const endv_y    = endv_pos.y.cast();
-    float const endv_tx   = endv_dir.x.cast();
-    float const endv_ty   = endv_dir.y.cast();
-    float const endv_qp   = tracks.stateQoP<F>( uttrack ).cast();
-    float const endv_z    = endv_pos.z.cast();
-    float const endv_tx2  = endv_tx * endv_tx;
-    float const endv_ty2  = endv_ty * endv_ty;
-    float const endv_txy2 = endv_tx2 + endv_ty2;
-    float const endv_txy  = std::sqrt( endv_txy2 );
-    float const endv_pq   = 1.f / endv_qp;
-    float const endv_p    = std::abs( endv_pq );
-    float const endv_pz   = endv_p / std::sqrt( 1.f + endv_txy2 );
-    float const endv_pt   = endv_pz * endv_txy;
-
-    mydebug( "pt: ", endv_pt, "preselpt", p_preSelPT );
-    if ( endv_pt < p_preSelPT ) continue;
-    CounterAcceptedBuffer += 1;
+    auto const endv_tx   = endv_dir.x;
+    auto const endv_ty   = endv_dir.y;
+    auto const endv_qp   = tracks.stateQoP<vF>( uttrack );
+    auto const endv_y    = endv_pos.y;
+    auto const endv_z    = endv_pos.z;
+    auto const endv_ty2  = endv_ty * endv_ty;
+    auto const endv_txy2 = endv_tx * endv_tx + endv_ty2;
 
     // determine if a track is expected to intersect the seeding station from the top or the bottom
     // check is performed at last layer of second station.
-    bool const TrackGoesTroughUpperHalf = ( endv_y + ( cache.LayerZPos[8] - endv_z ) * endv_ty ) > 0;
-
-    // extrapolate velo track y-position into all 12 Layers
-    // IIFE (Immediately invoked function expression) to keep yAtZ const
-    // compiler explorer tested that this isn't producing overhead over simple loop
-    auto const yAtZ = [&]() {
-      std::array<float, 12> tmp;
-
-      // start at beginning or middle of array depending on if track goes through upper or lower half of detector
-      // see creation of GeomCache for more info
-      const auto halfSize = cache.LayerZPos.size() / 2;
-      int const  cnt      = TrackGoesTroughUpperHalf ? halfSize : 0;
-
-      std::transform( cache.LayerZPos.begin() + cnt, cache.LayerZPos.begin() + 12 + cnt, tmp.begin(),
-                      [=]( float const z ) { return endv_y + ( z - endv_z ) * endv_ty; } );
-      return tmp;
-    }();
-
-    mydebug( "yatz: ", yAtZ );
-
-    // adjust the z-position to account for the predicted y-position of the track
-    auto const betterZ = [&]() {
-      std::array<float, 12> tmp;
-
-      // start at beginning or middle of array depending on if track goes through upper or lower half of detector
-      // see creation of GeomCache for more info
-      const auto halfSize = cache.LayerZPos.size() / 2;
-      int        cnt      = TrackGoesTroughUpperHalf ? halfSize : 0;
-
-      std::transform( yAtZ.begin(), yAtZ.end(), cache.LayerZPos.begin() + cnt, tmp.begin(),
-                      [&]( float const y, float const z ) { return z + y * cache.dZdYFiber[cnt++]; } );
-      return tmp;
-    }();
-
-    mydebug( "betterz: ", TrackGoesTroughUpperHalf ? " upper half" : " lower half", betterZ );
-
-    auto const dXdYLayer = [&]() {
-      std::array<float, 6> tmp;
-      // start at beginning or middle of array depending on if track goes through upper or lower half of detector
-      // see creation of GeomCache for more info
-      const auto halfSize = cache.dXdYFiber.size() / 2;
-      int        cnt      = TrackGoesTroughUpperHalf ? halfSize : 0;
-      std::transform( tmp.begin(), tmp.end(), cache.dXdYFiber.begin() + cnt, tmp.begin(),
-                      [=]( float const /*unused*/, float const dxdy ) { return -dxdy; } );
-      return tmp;
-    }();
-
-    mydebug( "dXdYLayer", TrackGoesTroughUpperHalf ? " upper half" : " lower half", dXdYLayer );
-
-    // start at beginning or middle of array depending on if track goes through upper or lower half of detector
-    // see creation of GeomCache for more info
-    const auto halfSize      = cache.ExtFacS2.size() / 2;
-    int const  ExtFac_Offset = TrackGoesTroughUpperHalf ? halfSize : 0;
+    auto const ones                = vI{1};
+    auto const zeroes              = vI{0};
+    auto const track_in_upper_half = signselect( ( endv_y + ( cache.LayerZPos[8] - endv_z ) * endv_ty ), ones, zeroes );
+    // offset for LayerZPos & ext_coeff and dxdy array
+    auto const up_down_offset = to_std_array( track_in_upper_half * ( cache.LayerZPos.size() / 2 ) );
 
     // charge * magnet polarity factor needed in various polynomials below
-    float const factor = std::copysign( 1.f, endv_qp ) * m_magscalefactor;
+    auto const factor = signselect( endv_qp, vec_mag_scale, neg_vec_mag_scale );
 
     // charge over momentum is usally in MeV so make it GeV
-    float const qOpGeV = endv_qp * float{Gaudi::Units::GeV} * m_magscalefactor;
+    auto const qOpGeV = endv_qp * float{Gaudi::Units::GeV} * m_magscalefactor;
+    // calculate an upper error boundary on the tracks x-prediction,
+    auto const upper_error =
+        min( max( p_UpperLimitOffset.value() + p_UpperLimitSlope.value() * abs( qOpGeV ), p_UpperLimitMin.value() ),
+             p_UpperLimitMax.value() );
+
+    // calculate a lower error boundary on the tracks x-prediction,
+    auto const lower_error =
+        min( p_LowerLimitOffset.value() + p_LowerLimitSlope.value() * abs( qOpGeV ), p_LowerLimitMax.value() );
 
     // extrapolate the track x-position into the scifi to determine search windows
     // common term for both extrapolations below
-    float const term1 =
+    auto const term1 =
         toSciFiExtParams[0] + endv_tx * ( -toSciFiExtParams[1] * factor + toSciFiExtParams[2] * endv_tx ) +
         endv_ty2 * ( toSciFiExtParams[3] + endv_tx * ( toSciFiExtParams[4] * factor + toSciFiExtParams[5] * endv_tx ) );
 
     // prediction of tracks x position with ut momentum estimate
-    float const xExt = qOpGeV * ( term1 + qOpGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * qOpGeV ) );
+    auto const xExt = qOpGeV * ( term1 + qOpGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * qOpGeV ) );
 
     // 1/p, given a minPT cut and the tracks slopes
-    float const minInvPGeV = factor / minPTCutValue * endv_txy / std::sqrt( 1.f + endv_txy2 );
+    auto const minInvPGeV = factor / minPTCutValue * sqrt( endv_txy2 ) / sqrt( 1.f + endv_txy2 );
 
     // given the above value of 1/p this should be the tracks x-position
     // we can use this to make our windows smaller given a minPT cut
-    float const minPTBorder =
+    auto const min_pt_border =
         minInvPGeV * ( term1 + minInvPGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * minInvPGeV ) );
 
-    // set start and end points in 1D hit array depending on if a track is in upper or lower detector half
-    mydebug( "Selecting upper or lower  hits", TrackGoesTroughUpperHalf );
-    auto const Xzones    = TrackGoesTroughUpperHalf ? PrFTInfo::xZonesUpper : PrFTInfo::xZonesLower;
-    auto const UVzones   = TrackGoesTroughUpperHalf ? PrFTInfo::uvZonesUpper : PrFTInfo::uvZonesLower;
-    int const  startS3L0 = hithandler.zonerange[Xzones[4]].first;
-    int const  startS3L1 = hithandler.zonerange[UVzones[4]].first;
-    int const  startS3L2 = hithandler.zonerange[UVzones[5]].first;
-    int const  startS3L3 = hithandler.zonerange[Xzones[5]].first;
-
-    int const startS2L0 = hithandler.zonerange[Xzones[2]].first;
-    int const startS2L1 = hithandler.zonerange[UVzones[2]].first;
-    int const startS2L2 = hithandler.zonerange[UVzones[3]].first;
-    int const startS2L3 = hithandler.zonerange[Xzones[3]].first;
-
-    // each variable is a pair [startIdx, size]
-    auto const S1L0range = hithandler.zonerange[Xzones[0]];
-    auto const S1L1range = hithandler.zonerange[UVzones[0]];
-    auto const S1L2range = hithandler.zonerange[UVzones[1]];
-    auto const S1L3range = hithandler.zonerange[Xzones[1]];
-
-    //
-    // ########################################################
-    // loop over the iterations for searching doublets
-    // ########################################################
-    //
-    for ( unsigned int iteration = 0; iteration < num_iterations; ++iteration ) {
-
-      mydebug( "++++++++++ Start iteration", iteration, " ++++++++++" );
-
-      // define some magic parameters for all the layers, depending on the current iteration
-      float const ExtFac_alpha_S1L0 =
-          ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset] : cache.ExtFacS2[ExtFac_Offset];
-      float const ExtFac_beta_S1L0 =
-          ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 1] : cache.ExtFacS2[ExtFac_Offset + 1];
-      float const ExtFac_gamma_S1L0 =
-          ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 2] : cache.ExtFacS2[ExtFac_Offset + 2];
-
-      float const ExtFac_alpha_S1L3 =
-          ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 3] : cache.ExtFacS2[ExtFac_Offset + 3];
-      float const ExtFac_beta_S1L3 =
-          ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 4] : cache.ExtFacS2[ExtFac_Offset + 4];
-      float const ExtFac_gamma_S1L3 =
-          ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 5] : cache.ExtFacS2[ExtFac_Offset + 5];
-
-      // well-defined for the first iteration only,
-      // for which the seeding doublet is searched in S3
-      float const ExtFac_alpha_S2L0 = ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 6] : 0.f;
-      float const ExtFac_beta_S2L0  = ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 7] : 0.f;
-      float const ExtFac_gamma_S2L0 = ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 8] : 0.f;
-
-      float const ExtFac_alpha_S2L3 = ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 9] : 0.f;
-      float const ExtFac_beta_S2L3  = ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 10] : 0.f;
-      float const ExtFac_gamma_S2L3 = ( iteration == 0 ) ? cache.ExtFacS3[ExtFac_Offset + 11] : 0.f;
-
-      // well-defined for the second iteration only,
-      // for which the seeding doublet is searched in S2
-      float const ExtFac_alpha_S3L0 = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 6];
-      float const ExtFac_beta_S3L0  = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 7];
-      float const ExtFac_gamma_S3L0 = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 8];
-
-      float const ExtFac_alpha_S3L3 = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 9];
-      float const ExtFac_beta_S3L3  = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 10];
-      float const ExtFac_gamma_S3L3 = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 11];
-
-      // first iteration: delta in Z between kink position in magnet and z position of S3L0
-      float const InvDeltaZMagS3L0 = 1.f / ( betterZ[8] - zPosKinkMagnet_S3 );
-
-      // second iteration: delta in Z between kink position in magnet and z position of S2L0
-      float const InvDeltaZMagS2L0 = 1.f / ( betterZ[4] - zPosKinkMagnet_S2 );
-
-      // extrapolate velo track x-position into magnet,
-      // for S3 (used in the first iteration) and S2 (used in the second iteration)
-      float const xAtMagnet_S2 = endv_x + ( zPosKinkMagnet_S2 - endv_z ) * endv_tx;
-      float const xAtMagnet_S3 = endv_x + ( zPosKinkMagnet_S3 - endv_z ) * endv_tx;
-
-      // extrapolate the velo straight into the SciFi
-      // first iteration:  use the z position of S3L0
-      // second iteration: use the z position of S2L0
-      float const straighExt =
-          ( iteration == 0 ) ? endv_x + ( betterZ[8] - endv_z ) * endv_tx : endv_x + ( betterZ[4] - endv_z ) * endv_tx;
-
-      // calculate an upper error boundary on the tracks x-prediction,
-      // depending on the current iteration
-      float const upperError =
-          ( iteration == 0 ) ? std::clamp( p_UpperLimitOffset + p_UpperLimitSlope * std::abs( qOpGeV ),
-                                           p_UpperLimitMin.value(), p_UpperLimitMax.value() )
-                             : std::clamp( p_UpperLimitOffset_2ndloop + p_UpperLimitSlope_2ndloop * std::abs( qOpGeV ),
-                                           p_UpperLimitMin_2ndloop.value(), p_UpperLimitMax_2ndloop.value() );
-
-      // calculate a lower error boundary on the tracks x-prediction,
-      // depending on the current iteration
-      float const lowerError =
-          ( iteration == 0 )
-              ? std::min( p_LowerLimitOffset + p_LowerLimitSlope * std::abs( qOpGeV ), p_LowerLimitMax.value() )
-              : std::min( p_LowerLimitOffset_2ndloop + p_LowerLimitSlope_2ndloop * std::abs( qOpGeV ),
-                          p_LowerLimitMax_2ndloop.value() );
-
-      // depending on the sign of q set the lower and upper error in the right direction
-      // on top of that make sure that the upper error isn't larger than the minPTBorder
-      // and the lower error isn't bigger than xExt which means we don't look further than the straight line prediction
-      float xMin = factor > 0 ? straighExt + ( ( xExt < lowerError ) ? 0 : ( xExt - lowerError ) )
-                              : straighExt + std::max( xExt - upperError, minPTBorder );
-
-      float xMax = factor > 0 ? straighExt + std::min( xExt + upperError, minPTBorder )
-                              : straighExt + ( ( xExt > -lowerError ) ? 0 : ( xExt + lowerError ) );
-
-      if ( endv_pt > p_wrongSignPT ) {
-        xMin = straighExt - std::abs( xExt ) - upperError;
-        xMax = straighExt + std::abs( xExt ) + upperError;
+    auto const a_qp                  = to_std_array( endv_qp );
+    auto const a_factor              = to_std_array( factor );
+    auto const a_upper_error         = to_std_array( upper_error );
+    auto const a_lower_error         = to_std_array( lower_error );
+    auto const a_xExt                = to_std_array( xExt );
+    auto const a_min_pt_border       = to_std_array( min_pt_border );
+    auto const a_track_in_upper_half = to_std_array( track_in_upper_half );
+
+    for ( size_t tr{0}; tr < vec::size; ++tr ) {
+      if ( UNLIKELY( !testbit( loop_mask, tr ) ) ) { break; }
+      // extrapolate velo track y-position into all 12 Layers
+      // IIFE (Immediately invoked function expression) to keep yAtZ const
+      // compiler explorer tested that this isn't producing overhead over simple loop
+      auto const& scalar_endv_pos = tracks.statePos<sF>( uttrack + tr );
+      auto const& scalar_endv_dir = tracks.stateDir<sF>( uttrack + tr );
+
+      auto const yAtZ = [&]() {
+        std::array<float, 12> tmp;
+        std::transform( cache.LayerZPos.begin() + up_down_offset[tr], cache.LayerZPos.begin() + 12 + up_down_offset[tr],
+                        tmp.begin(), [=]( float const z ) {
+                          return scalar_endv_pos.y.cast() + ( z - scalar_endv_pos.z.cast() ) * scalar_endv_dir.y.cast();
+                        } );
+        return tmp;
+      }();
+
+      mydebug( "yatz: ", yAtZ );
+
+      auto bestcandidate{find_track<config_loop_1>( cache, hithandler, a_track_in_upper_half[tr], scalar_endv_pos,
+                                                    scalar_endv_dir, a_qp[tr], a_factor[tr], a_upper_error[tr],
+                                                    a_lower_error[tr], a_xExt[tr], a_min_pt_border[tr], yAtZ )};
+
+      mydebug( "Search 1 done!" );
+      if ( bestcandidate.ids.empty() && p_SecondLoop.value() ) {
+        mydebug( "Search 2nd loop!" );
+        ++buffer_2_loop;
+        bestcandidate =
+            find_track<config_loop_2>( cache, hithandler, a_track_in_upper_half[tr], scalar_endv_pos, scalar_endv_dir,
+                                       a_qp[tr], a_factor[tr], 0.87f * a_upper_error[tr], 0.87f * a_lower_error[tr],
+                                       0.87f * a_xExt[tr], 0.87f * a_min_pt_border[tr], yAtZ );
       }
+      mydebug( "Search done!" );
 
-      mydebug( "==================" );
-      mydebug( "qOpGeV", qOpGeV, "straight: ", straighExt, "xExt ", xExt, " dxref ", minPTBorder, "lowError",
-               lowerError, "upError: ", upperError, "xmin", xMin, "xmax: ", xMax, "minInvPGeV", minInvPGeV );
-
-      // a range is a pair of start and size
-      // first iteration: start with S3L0
-      // second iteration: start with S2L0
-      auto startRange = ( iteration == 0 ) ? hithandler.zonerange[Xzones[4]] : hithandler.zonerange[Xzones[2]];
-
-      int const startL0 =
-          getUpperBoundBinary( hithandler.hits, startRange.first, startRange.first + startRange.second, xMin );
-      int const EndL0 = getUpperBoundBinary( hithandler.hits, startL0, startRange.first + startRange.second, xMax );
-
-      // some variables to cache offsets
-      int startoffsetS3L0{startS3L0};
-      int startoffsetS3L1{startS3L1};
-      int startoffsetS3L2{startS3L2};
-      int startoffsetS3L3{startS3L3};
-
-      // FIXME these offsets seem to work in the tested minPt=.49 scenario
-      // These are not always 100% correct though
-      int startoffsetS2L0{startS2L0};
-      int startoffsetS2L1{startS2L1};
-      int startoffsetS2L2{startS2L2};
-      int startoffsetS2L3{startS2L3};
-
-      // z delta between the two layers of the seeding station
-      // first iteration:  z(S3L3) - z(S3L0)
-      // second iteration: z(S2L3) - z(S2L0)
-      float const zdelta_doublet_XLayers = ( iteration == 0 ) ? betterZ[11] - betterZ[8] : betterZ[7] - betterZ[4];
-
-      // inverse of the above quantity
-      float const inv_zdelta_doublet_XLayers =
-          ( iteration == 0 ) ? 1.f / ( betterZ[11] - betterZ[8] ) : 1.f / ( betterZ[7] - betterZ[4] );
-
-      // ready to start a new second loop
-      if ( iteration > 0 ) ++CounterSecondLoopBuffer;
-
-      // Loop over the hits in the first x-layer within the search window
-      for ( auto idxL0{startL0}; idxL0 < EndL0; ++idxL0 ) {
-
-        mydebug( "#####################Start new hit################################" );
-
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-        //                  Start of search in the first station               //
-        //                  First iteration:  S3                               //
-        //                  Second iteration: S2                               //
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-
-        // get the x value of hit
-        // first iteration:  S3x0
-        // second iteration: S2x0
-        float const firstXhit = hithandler.hits[idxL0];
-
-        // calculate a rough estimate of the expected slope,
-        // depending on the current iteration
-        float const tx = ( iteration == 0 ) ? ( firstXhit - xAtMagnet_S3 ) * InvDeltaZMagS3L0
-                                            : ( firstXhit - xAtMagnet_S2 ) * InvDeltaZMagS2L0;
-
-        // this is a small correction mainly for lower momentum tracks for which the zkink isn't perfect
-        // and they'll have some bending. This is solely empirical.
-        // first iteration:  curvature to S3L3
-        // second iteration: curvature to S2L3
-        float const doubletCurve =
-            ( iteration == 0 )
-                ? ( tx - endv_tx ) * ( -0.231615f + ( endv_tx * 33.1256f + tx * 10.4693f ) * ( tx - endv_tx ) )
-                : ( tx - endv_tx ) * ( -1.050f + ( endv_tx * 17.74f + tx * 20.62f ) * ( tx - endv_tx ) );
-
-        // piece together the x-prediction in the last x layer
-        float const secondXhitPred = firstXhit + tx * zdelta_doublet_XLayers + doubletCurve;
-
-        mydebug( "firstXhit", firstXhit, "lhcbid", hithandler.IDs[idxL0], "secondXhitPred", secondXhitPred, "tx", tx );
-
-        // quit if we are outside of the sensitive area
-        if ( std::abs( secondXhitPred ) > 3200.f ) continue;
-
-        float secondXhit;
-
-        if ( iteration == 0 ) S3buffer += 1;
-
-        // FIXME this is the most expensive search, luckily we can cache the starting point
-        // however the first search is still costly, maybe we can perform the first search with an avx2 search
-        // before the loop?
-        auto const startoffset               = ( iteration == 0 ) ? startoffsetS3L3 : startoffsetS2L3;
-        auto const [selectsecondXhit, Delta] = SearchLayerForHitLinear( hithandler.hits, startoffset, secondXhitPred );
-
-        // cache an offset for the next search
-        if ( iteration == 0 )
-          startoffsetS3L3 = selectsecondXhit;
-        else
-          startoffsetS2L3 = selectsecondXhit;
-
-        // x position of the second hit of the doublet
-        secondXhit = hithandler.hits[selectsecondXhit];
-
-        // calculate x-slope of the found doublet
-        float const newtx = ( secondXhit - firstXhit ) * inv_zdelta_doublet_XLayers;
-
-        // deltaSlope is proportional to the momentum and used in many places to tune momentum dependant search windows
-        float const deltaSlope = newtx - endv_tx;
-
-        // the minimum value of 0.011 avoids too small search windows for tracks with P > 100 GeV
-        float const absDSlope = std::max( 0.011f, std::abs( deltaSlope ) );
-
-        // check if the x-hit is within our search window acceptance, depending on the current iteration
-        if ( iteration == 0 ) {
-          if ( Delta > std::min( p_DW_b + p_DW_m * absDSlope, p_DW_max.value() ) ) continue;
-        } else {
-          // second iteration
-          if ( Delta > std::min( p_S2L3W_b_2ndloop + p_S2L3W_m_2ndloop * absDSlope, p_DW_max_2ndloop.value() ) )
-            continue;
-        }
+      if ( !bestcandidate.ids.empty() ) {
+        int i = Output.size();
+        mydebug( "saving track: ", i );
+        int mask = true;
 
-        // we have a doublet and are going to try and build a track
-        // for that we need a couple tmp objects to keep track of stuff
-        // tmpidvec will store the indices of the hit to refind it later
-        boost::container::static_vector<int, 12> tmpidvec{selectsecondXhit, idxL0};
+        Output.compressstore_trackVP<sI>( i, mask, tracks.trackVP<sI>( uttrack + tr ) );
+        Output.compressstore_trackUT<sI>( i, mask, uttrack + tr );
 
-        if ( iteration == 0 ) {
-          ++CounterXDoubletsS3;
-        } else {
-          ++CounterXDoubletsS2_2ndloop;
-        }
+        float const qop = 1.f / bestcandidate.PQ;
+        Output.compressstore_stateQoP<sF>( i, mask, qop );
+
+        int n_hits = 0;
 
-        // this is a vector that will remember some values for the linear fit performed at the very end.
-        // for each hit this will store 4 values, [xvalue, zdelta, curvature, residual]
-        // where zdelta is the distance between the hit and the hit in layer 0 of the seeding station
-        // curvature is the non-linear part of the prediction. See x3curve variable for example
-        std::array<float, 24> fitvec{firstXhit, 0, 0, 0, secondXhit, zdelta_doublet_XLayers, doubletCurve, 0};
-
-        // fitcnt simply keeps track of what's in the array
-        int fitcnt = 8;
-
-        float const direction = -1.f * std::copysign( 1.f, deltaSlope );
-
-        // correct the straight line estimate of the y-position at station 3 layer 0
-        float const ycorr =
-            absDSlope * ( direction * deltaYParams[0] +
-                          endv_ty * ( deltaYParams[1] + deltaYParams[5] * endv_ty2 +
-                                      endv_tx * ( direction * ( deltaYParams[2] + deltaYParams[6] * endv_ty2 ) +
-                                                  endv_tx * ( deltaYParams[3] + deltaYParams[7] * endv_ty2 +
-                                                              deltaYParams[4] * direction * endv_tx ) ) ) );
-
-        // these variables are saved for later as it is quite good for ghost rejection
-        float S2UVDelta( 1.0f ), S3UVDelta( 1.0f );
-
-        // keep track of how many hits we find
-        int numuvhits{0};
-        int numxhits{2};
-
-        // to handle correctly the ordering of tmpidvec in the second iteration
-        int selectS2L1( -1 ), selectS2L2( -1 );
-
-        if ( iteration == 0 ) {
-
-          // first iteration:
-          // calculate the x-predictions for the U V layers in station 3
-          float const S3x1pred = firstXhit + newtx * ( betterZ[9] - betterZ[8] ) + ( yAtZ[9] - ycorr ) * dXdYLayer[4];
-          float const S3x2pred =
-              firstXhit + newtx * ( betterZ[10] - betterZ[8] ) + ( yAtZ[10] - 1.03f * ycorr ) * dXdYLayer[5];
-
-          S3UVbuffer += 1;
-
-          // search for hits in layer
-          auto const [selectS3L1, DeltaS3L1] = SearchLayerForHitLinear( hithandler.hits, startoffsetS3L1, S3x1pred );
-          auto const [selectS3L2, DeltaS3L2] = SearchLayerForHitLinear( hithandler.hits, startoffsetS3L2, S3x2pred );
-
-          // cache the position for the next search
-          startoffsetS3L1 = selectS3L1;
-          startoffsetS3L2 = selectS3L2;
-          mydebug( "deltas x1, x2: ", DeltaS3L1, DeltaS3L2, hithandler.IDs[selectS3L1], hithandler.IDs[selectS3L2] );
-
-          // calculate acceptance window for uv hits based on momentum
-          float const S3uvwindow = ( p_UV3W_b + p_UV3W_m * absDSlope );
-
-          if ( DeltaS3L2 < S3uvwindow ) {
-            // I found a hit in S3L2
-            tmpidvec.push_back( selectS3L2 );
-            ++numuvhits;
-          }
-
-          if ( DeltaS3L1 < S3uvwindow ) {
-            // I found a hit in S3L1
-            tmpidvec.push_back( selectS3L1 );
-            ++numuvhits;
-          } else if ( numuvhits < 1 )
-            // didn't find any uv hits let's try our luck in the next loop iteration
-            continue;
-
-          // this variable is saved for later as it is quite good for ghost rejection
-          S3UVDelta =
-              numuvhits < 2
-                  ? 1.0f
-                  : 1.0f + std::abs( hithandler.hits[selectS3L1] - S3x1pred + hithandler.hits[selectS3L2] - S3x2pred );
-
-        } else {
-          // second iteration:
-          // calculate the x-predictions for the U V layers in station 2
-          float const S2x1pred =
-              firstXhit + newtx * ( betterZ[5] - betterZ[4] ) + ( yAtZ[5] - .83f * ycorr ) * dXdYLayer[2];
-          float const S2x2pred =
-              firstXhit + newtx * ( betterZ[6] - betterZ[4] ) + ( yAtZ[6] - .85f * ycorr ) * dXdYLayer[3];
-
-          // search for hits in the layers
-          auto const [selectS2L1_temp, DeltaS2L1] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS2L1, S2x1pred );
-          auto const [selectS2L2_temp, DeltaS2L2] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS2L2, S2x2pred );
-
-          // cache the position for the next search
-          startoffsetS2L1 = selectS2L1_temp;
-          startoffsetS2L2 = selectS2L2_temp;
-          mydebug( "deltas x1, x2: ", DeltaS2L1, DeltaS2L2, hithandler.IDs[selectS2L1],
-                   hithandler.IDs[selectS2L2_temp] );
-
-          // calculate the acceptance window for uv hits based on momentum
-          float const S2uvwindow = ( p_UV2W_b_2ndloop + p_UV2W_m_2ndloop * absDSlope );
-
-          if ( DeltaS2L2 < S2uvwindow ) {
-            // I found a hit in S2L2
-            selectS2L2 = selectS2L2_temp;
-            tmpidvec.push_back( selectS2L2 );
-            ++numuvhits;
-          }
-
-          if ( DeltaS2L1 < S2uvwindow ) {
-            // I found a hit in S2L1
-            selectS2L1 = selectS2L1_temp;
-            tmpidvec.push_back( selectS2L1 );
-            ++numuvhits;
-
-          } else if ( numuvhits < 1 )
-            // didn't find any uv hits let's try our luck in the next loop iteration
-            continue;
-
-          // this variable is saved for later as it is quite good for ghost rejection
-          S2UVDelta =
-              numuvhits < 2
-                  ? 1.0f
-                  : 1.0f + std::abs( hithandler.hits[selectS2L1] - S2x1pred + hithandler.hits[selectS2L2] - S2x2pred );
-
-          // hit threshold in the second iteration
-          if ( ( numuvhits + numxhits ) < p_numHitsThrS2_2ndloop ) continue;
-
-        } // end of the S2 doublet search in the second iteration
-
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-        //                  Start of search in the second station              //
-        //                  First iteration:  S3                               //
-        //                  Second iteration: S2                               //
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-
-        // most of what happens below is similar to station 2
-
-        // these variables will stay true if x-hits are found in both layers
-        // first iteration: S3Doublet
-        // second iteration: S2Doublet
-        bool S3Doublet = true;
-        bool S2Doublet = true;
-
-        // temp objects to handle correctly the different iterations
-        int selectS3L0( -1 ), selectS2L0( -1 );
-        int selectS3L3( -1 ), selectS2L3( -1 );
-
-        if ( iteration == 0 ) {
-          // first iteration: search for doublets in S2
-          // doublets have been found in S3 already
-
-          float const S2x0curve = ExtFac_alpha_S2L0 * deltaSlope + ExtFac_beta_S2L0 * deltaSlope +
-                                  ExtFac_gamma_S2L0 * deltaSlope * endv_ty2;
-
-          float S2x0pred = firstXhit + newtx * ( betterZ[4] - betterZ[8] ) + S2x0curve;
-
-          float const S2x3curve = deltaSlope * ExtFac_alpha_S2L3 + deltaSlope * ExtFac_beta_S2L3 +
-                                  deltaSlope * ExtFac_gamma_S2L3 * endv_ty2;
-
-          float S2x3pred = firstXhit + newtx * ( betterZ[7] - betterZ[8] ) + S2x3curve;
-
-          S2buffer += 1;
-
-          // get the best hit candidate
-          auto const [selectS2L3_temp, DeltaS2L3] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS2L3, S2x3pred );
-
-          // cache the position for the next search
-          startoffsetS2L3 = selectS2L3_temp;
-
-          // did we find a hit?
-          if ( DeltaS2L3 < ( p_S2L3W_b + p_S2L3W_m * absDSlope ) ) {
-            selectS2L3 = selectS2L3_temp;
-
-            tmpidvec.push_back( selectS2L3 );
-            ++numxhits;
-
-            // fill fit vec with the x-hit information
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS2L3], betterZ[7] - betterZ[8], S2x3curve,
-                        hithandler.hits[selectS2L3] - S2x3pred );
-          } else {
-            // we didn't find a hit and thus also not a doublet
-            S2Doublet = false;
-
-            // do we even really want to continue with this track or is it crap?
-            if ( numxhits + numuvhits < 4 ) continue;
-          }
-
-          // get the hit candidate for S2L0
-          auto const [selectS2L0_temp, DeltaS2L0] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS2L0, S2x0pred );
-
-          // cache position for next search
-          startoffsetS2L0 = selectS2L0_temp;
-
-          // did we find a hit?
-          if ( DeltaS2L0 < ( p_S2L0W_b + p_S2L0W_m * absDSlope ) ) {
-            selectS2L0 = selectS2L0_temp;
-            tmpidvec.push_back( selectS2L0 );
-            ++numxhits;
-
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS2L0], betterZ[4] - betterZ[8], S2x0curve,
-                        hithandler.hits[selectS2L0] - S2x0pred );
-          } else {
-            S2Doublet = false;
-            if ( numxhits + numuvhits < 5 ) continue;
-          }
-        } else {
-          // second iteration: search for doublets in S3
-          // a doublet has been found in S2 already
-
-          // make x-predictions
-          float const S3x0curve = ExtFac_alpha_S3L0 * deltaSlope + ExtFac_beta_S3L0 * deltaSlope +
-                                  ExtFac_gamma_S3L0 * deltaSlope * endv_ty2;
-
-          float const S3x0pred = secondXhit + newtx * ( betterZ[8] - betterZ[7] ) + S3x0curve;
-
-          float const S3x3curve = deltaSlope * ExtFac_alpha_S3L3 + deltaSlope * ExtFac_beta_S3L3 +
-                                  deltaSlope * ExtFac_gamma_S3L3 * endv_ty2;
-
-          float const S3x3pred = secondXhit + newtx * ( betterZ[11] - betterZ[7] ) + S3x3curve;
-
-          // quit if we are outside of the sensitive area
-          if ( std::abs( S3x3pred ) > 3200.f ) continue;
-
-          mydebug( "S3X0, S3X3", S3x0pred, S3x3pred, "newtx", newtx, "dZsS3L0", betterZ[8] - betterZ[7], "dZsS3L3",
-                   betterZ[11] - betterZ[7] );
-
-          // get the best hit candidate
-          auto const [selectS3L3_temp, DeltaS3L3] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS3L3, S3x3pred );
-
-          // cache the position for the next search
-          startoffsetS3L3 = selectS3L3_temp;
-          mydebug( "selectS3L3", selectS3L3, " delta S3L3: ", DeltaS3L3, hithandler.IDs[selectS3L3_temp] );
-
-          // did we find a hit?
-          if ( DeltaS3L3 < ( p_S3L3W_b_2ndloop + p_S3L3W_m_2ndloop * absDSlope ) ) {
-            selectS3L3 = selectS3L3_temp;
-            tmpidvec.push_back( selectS3L3 );
-
-            ++numxhits;
-
-            // here I use (betterZ[11] - betterZ[4]) instead of (betterZ[11] - betterZ[7])
-            // otherwise most of the candidates won't survive the quality factor.
-            // The factors used to get the candidate quality should be retuned, I guess
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS3L3], betterZ[11] - betterZ[4], S3x3curve,
-                        hithandler.hits[selectS3L3] - S3x3pred );
-          } else {
-            // we didn't find a hit and thus also not a doublet
-            S3Doublet = false;
-
-            // do we even really want to continue with this track or is it crap?
-            if ( numxhits + numuvhits < 4 ) continue;
-          }
-
-          // get the hit candidate for S3L0
-          auto const [selectS3L0_temp, DeltaS3L0] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS3L0, S3x0pred );
-
-          // cache the position for the next search
-          startoffsetS3L0 = selectS3L0_temp;
-          mydebug( "selectS3L0", selectS3L0, " delta S3L0: ", DeltaS3L0, hithandler.IDs[selectS3L0] );
-
-          // did we find a hit?
-          if ( DeltaS3L0 < ( p_S3L0W_b_2ndloop + p_S3L0W_m_2ndloop * absDSlope ) ) {
-            selectS3L0 = selectS3L0_temp;
-            tmpidvec.push_back( selectS3L0 );
-
-            ++numxhits;
-
-            // here I use (betterZ[8] - betterZ[4]) instead of (betterZ[8] - betterZ[7])
-            // otherwise most of the candidates won't survive the quality factor.
-            // The factors used to get the candidate quality should be retuned, I guess
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS3L0], betterZ[8] - betterZ[4], S3x0curve,
-                        hithandler.hits[selectS3L0] - S3x0pred );
-          } else {
-            S3Doublet = false;
-            if ( numxhits + numuvhits < 5 ) continue;
-          }
-        } // end of the second iteration
-
-        if ( iteration == 0 ) {
-          ++CounterXDoubletsS3S2;
-        } else {
-          ++CounterXDoubletsS3S2_2ndloop;
+        for ( auto idx{bestcandidate.ids.begin()}; idx != bestcandidate.ids.end(); ++idx, ++n_hits ) {
+          Output.compressstore_hit<sI>( i, n_hits, mask, hithandler.IDs[*idx] );
         }
 
-        // we only check for uv hits if we found a doublet
-        // this choice was made since a doublet and it's slope allow for small search windows
-        // FIXME we could potentially allow this search in other/all scenarios but this need more tweaking
-        // Note that this search will only help with ghost rejection, not help with efficiencies
-        // Thus as long as ghosts aren't a problem there isn't really a big reason to loosen this requirement
-        if ( ( iteration == 0 ) && S2Doublet ) {
+        Output.compressstore_nHits<sI>( i, mask, bestcandidate.numHits );
 
-          S2UVbuffer += 1;
+        // AtT State
+        float const endT_z = cache.LayerZPos[8];
+        float const endT_x = bestcandidate.newx0;
+        auto        endT_y = scalar_endv_pos.y + scalar_endv_dir.y * ( endT_z - scalar_endv_pos.z );
 
-          float const S2Tx =
-              ( hithandler.hits[selectS2L3] - hithandler.hits[selectS2L0] ) / ( betterZ[7] - betterZ[4] );
+        auto endT_pos = Vec3<sF>( endT_x, endT_y, endT_z );
+        auto endT_dir = Vec3<sF>( bestcandidate.finaltx, scalar_endv_dir.y, 1.f );
 
-          float const S2x1pred = hithandler.hits[selectS2L0] + S2Tx * ( betterZ[5] - betterZ[4] ) +
-                                 ( yAtZ[5] - 0.83f * ycorr ) * dXdYLayer[2];
+        Output.compressstore_statePos<sF>( i, mask, endT_pos );
+        Output.compressstore_stateDir<sF>( i, mask, endT_dir );
 
-          float const S2x2pred = hithandler.hits[selectS2L0] + S2Tx * ( betterZ[6] - betterZ[4] ) +
-                                 ( yAtZ[6] - 0.85f * ycorr ) * dXdYLayer[3];
+        Output.size() += scalar::popcount( mask );
 
-          auto const [selectS2L1_temp, DeltaS2L1] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS2L1, S2x1pred );
-          startoffsetS2L1 = selectS2L1_temp;
+      } // save the bestcandidate if we built one
+    }
+  } // loop over the UT tracks
 
-          auto const [selectS2L2_temp, DeltaS2L2] =
-              SearchLayerForHitLinear( hithandler.hits, startoffsetS2L2, S2x2pred );
-          startoffsetS2L2 = selectS2L2_temp;
+  m_counter_created_tracks += Output.size();
+  return Output;
+}
 
-          float const S2uvwindow = ( p_UV2W_b + p_UV2W_m * absDSlope );
+template <typename config>
+[[using gnu: const, hot]] SciFiTrackForwardingCand const SciFiTrackForwarding::find_track(
+    GeomCache const& cache, SciFiTrackForwardingHits const& hithandler, int const track_in_upper_half,
+    Vec3<SIMDWrapper::scalar::float_v> const& endv_pos, Vec3<SIMDWrapper::scalar::float_v> const& endv_dir,
+    float const endv_qp, float const factor, float const upper_error, float const lower_error, float const xExt,
+    float const min_pt_border, std::array<float, 12> const& yAtZ ) const {
+
+  using c = config;
+
+  float const endv_x    = endv_pos.x.cast();
+  float const endv_tx   = endv_dir.x.cast();
+  float const endv_ty   = endv_dir.y.cast();
+  float const endv_z    = endv_pos.z.cast();
+  float const endv_tx2  = endv_tx * endv_tx;
+  float const endv_ty2  = endv_ty * endv_ty;
+  float const endv_txy2 = endv_tx2 + endv_ty2;
+  float const endv_txy  = std::sqrt( endv_txy2 );
+  float const endv_pq   = 1.f / endv_qp;
+  float const endv_p    = std::abs( endv_pq );
+  float const endv_pz   = endv_p / std::sqrt( 1.f + endv_txy2 );
+  float const endv_pt   = endv_pz * endv_txy;
+
+  SciFiTrackForwardingCand bestcandidate{};
+
+  int const   up_down_offset        = track_in_upper_half * ( cache.LayerZPos.size() / 2 );
+  int const   up_down_dxdy_offset   = track_in_upper_half * ( cache.dXdYFiber.size() / 2 );
+  float const seed_station_z_pos_l0 = cache.LayerZPos[up_down_offset + c::seed_first_layer_z_idx];
+
+  // extrapolate velo track x-position into magnet,
+  float const x_at_magnet = endv_x + ( c::zPosKinkMagnet - endv_z ) * endv_tx;
+
+  // extrapolate the velo straight into the SciFi
+  float const straighExt = endv_x + ( seed_station_z_pos_l0 - endv_z ) * endv_tx;
+
+  // depending on the sign of q set the lower and upper error in the right direction
+  // on top of that make sure that the upper error isn't larger than the min_pt_border
+  // and the lower error isn't bigger than xExt which means we don't look further than the straight line prediction
+  float xMin = factor > 0 ? straighExt + ( ( xExt < lower_error ) ? 0 : ( xExt - lower_error ) )
+                          : straighExt + std::max( xExt - upper_error, min_pt_border );
+
+  float xMax = factor > 0 ? straighExt + std::min( xExt + upper_error, min_pt_border )
+                          : straighExt + ( ( xExt > -lower_error ) ? 0 : ( xExt + lower_error ) );
+
+  if ( endv_pt > p_wrongSignPT.value() ) {
+    xMin = straighExt - std::abs( xExt ) - upper_error;
+    xMax = straighExt + std::abs( xExt ) + upper_error;
+  }
 
-          if ( DeltaS2L1 > S2uvwindow and DeltaS2L2 > S2uvwindow ) { continue; }
+  mydebug( "==================" );
+  mydebug( "straight: ", straighExt, "xExt ", xExt, " dxref ", min_pt_border, "lowError", lower_error,
+           "upError: ", upper_error, "xmin", xMin, "xmax: ", xMax );
 
-          if ( DeltaS2L2 < S2uvwindow ) {
-            selectS2L2 = selectS2L2_temp;
-            tmpidvec.push_back( selectS2L2 );
-            ++numuvhits;
-          }
+  // set start and end points in 1D hit array depending on if a track is in upper or lower detector half
+  mydebug( "Selecting upper or lower  hits", track_in_upper_half );
+  auto const Xzones  = track_in_upper_half ? PrFTInfo::xZonesUpper : PrFTInfo::xZonesLower;
+  auto const UVzones = track_in_upper_half ? PrFTInfo::uvZonesUpper : PrFTInfo::uvZonesLower;
 
-          if ( DeltaS2L1 < S2uvwindow ) {
-            selectS2L1 = selectS2L1_temp;
-            tmpidvec.push_back( selectS2L1 );
-            ++numuvhits;
-          }
+  // a range is a pair of start and size
+  auto const seed_station_range_l0 = hithandler.zonerange[Xzones[c::seed_station_idx_first]];
 
-          if ( DeltaS2L1 < S2uvwindow and DeltaS2L2 < S2uvwindow ) {
-            S2UVDelta += std::abs( hithandler.hits[selectS2L1] + hithandler.hits[selectS2L2] - S2x1pred - S2x2pred );
-          }
-        } // end of the UV work in the first iteration
+  int const start_s3_l0 =
+      seed_station_range_l0.first +
+      get_closest_hit_idx_binary( {hithandler.hits.data() + seed_station_range_l0.first, seed_station_range_l0.second},
+                                  xMin );
 
-        // similar thing, but for the second iteration: UV layers of station 3
-        if ( ( iteration > 0 ) && S3Doublet ) {
+  int const end_s3_l0 = start_s3_l0 + get_closest_hit_idx_binary(
+                                          {hithandler.hits.data() + start_s3_l0,
+                                           seed_station_range_l0.second - start_s3_l0 + seed_station_range_l0.first},
+                                          xMax );
 
-          float const S3Tx =
-              ( hithandler.hits[selectS3L3] - hithandler.hits[selectS3L0] ) / ( betterZ[11] - betterZ[8] );
+  int start_seed_station_l1 = hithandler.zonerange[UVzones[c::seed_station_idx_first]].first;
+  int start_seed_station_l2 = hithandler.zonerange[UVzones[c::seed_station_idx_last]].first;
+  int start_seed_station_l3 = hithandler.zonerange[Xzones[c::seed_station_idx_last]].first;
 
-          float const S3x1pred = hithandler.hits[selectS3L0] + S3Tx * ( betterZ[9] - betterZ[8] ) +
-                                 ( yAtZ[9] - 1.f * ycorr ) * dXdYLayer[4];
+  int start_next_station_l0 = hithandler.zonerange[Xzones[c::next_station_idx_first]].first;
+  int start_next_station_l1 = hithandler.zonerange[UVzones[c::next_station_idx_first]].first;
+  int start_next_station_l2 = hithandler.zonerange[UVzones[c::next_station_idx_last]].first;
+  int start_next_station_l3 = hithandler.zonerange[Xzones[c::next_station_idx_last]].first;
 
-          float const S3x2pred = hithandler.hits[selectS3L0] + S3Tx * ( betterZ[10] - betterZ[8] ) +
-                                 ( yAtZ[10] - 1.03f * ycorr ) * dXdYLayer[5];
+  // each variable is a pair [startIdx, size]
+  auto const last_station_range_l0 = hithandler.zonerange[Xzones[c::last_station_idx_first]];
+  auto const last_station_range_l1 = hithandler.zonerange[UVzones[c::last_station_idx_first]];
+  auto const last_station_range_l2 = hithandler.zonerange[UVzones[c::last_station_idx_last]];
+  auto const last_station_range_l3 = hithandler.zonerange[Xzones[c::last_station_idx_last]];
 
-          mydebug( S3Tx, betterZ[11] - betterZ[8], hithandler.hits[selectS3L0], hithandler.hits[selectS3L3], S3x1pred,
-                   S3x2pred );
-          mydebug( hithandler.hits[startS3L1 + 1], hithandler.hits[startS3L2 - 2] );
-          mydebug( LHCb::LHCbID( hithandler.IDs[startS3L1 + 1] ), LHCb::LHCbID( hithandler.IDs[startS3L2 - 2] ) );
-          mydebug( hithandler.hits[startS3L2 + 1], hithandler.hits[startS3L0 - 2] );
-          mydebug( LHCb::LHCbID( hithandler.IDs[startS3L2 + 1] ), LHCb::LHCbID( hithandler.IDs[startS3L0 - 2] ) );
+  // z delta between the two x-layers of the seeding station
+  float const zdelta_seed_doublet =
+      cache.LayerZPos[up_down_offset + 3 + c::seed_first_layer_z_idx] - seed_station_z_pos_l0;
 
-          auto const [selectS3L1, DeltaS3L1] = SearchLayerForHitLinear( hithandler.hits, startoffsetS3L1, S3x1pred );
-          startoffsetS3L1                    = selectS3L1;
+  // inverse of the above quantity
+  float const inv_zdelta_seed_doublet = 1.f / zdelta_seed_doublet;
+
+  // delta in Z between kink position in magnet and z position of seed station
+  float const inv_zdelta_kink_station = 1.f / ( seed_station_z_pos_l0 - c::zPosKinkMagnet );
+
+  // Loop over the hits in the first x-layer within the search window
+  for ( auto idxL0{start_s3_l0}; idxL0 < end_s3_l0; ++idxL0 ) {
 
-          auto const [selectS3L2, DeltaS3L2] = SearchLayerForHitLinear( hithandler.hits, startoffsetS3L2, S3x2pred );
-          startoffsetS3L2                    = selectS3L2;
+    mydebug( "#####################Start new hit################################" );
 
-          float const S3uvwindow = ( p_UV3W_b_2ndloop + p_UV3W_m_2ndloop * absDSlope );
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
+    //                  Start of search in the first station               //
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
 
-          if ( DeltaS3L1 > S3uvwindow and DeltaS3L2 > S3uvwindow ) { continue; }
+    // get the x value of hit
+    float const firstXhit = hithandler.hits[idxL0];
 
-          if ( DeltaS3L2 < S3uvwindow ) {
-            tmpidvec.push_back( selectS3L2 );
-            ++numuvhits;
-          }
+    // calculate a rough estimate of the expected slope,
+    // depending on the current iteration
+    float const tx = ( firstXhit - x_at_magnet ) * inv_zdelta_kink_station;
 
-          if ( DeltaS3L1 < S3uvwindow ) {
-            tmpidvec.push_back( selectS3L1 );
-            ++numuvhits;
-          }
+    // this is a small correction mainly for lower momentum tracks for which the zkink isn't perfect
+    // and they'll have some bending. This is solely empirical.
+    float const doubletCurve =
+        ( tx - endv_tx ) *
+        ( c::doublet_curve_c0 + ( endv_tx * c::doublet_curve_c1 + tx * c::doublet_curve_c2 ) * ( tx - endv_tx ) );
 
-          if ( DeltaS3L1 < S3uvwindow and DeltaS3L2 < S3uvwindow ) {
-            S3UVDelta += std::abs( hithandler.hits[selectS3L1] + hithandler.hits[selectS3L2] - S3x1pred - S3x2pred );
-          }
+    // piece together the x-prediction in the last x layer
+    float const secondXhitPred = firstXhit + tx * zdelta_seed_doublet + doubletCurve;
 
-        } // end of the UV work for the second iteration
+    mydebug( "firstXhit", firstXhit, "lhcbid", hithandler.IDs[idxL0], "secondXhitPred", secondXhitPred, "tx", tx );
+    mydebug( doubletCurve, inv_zdelta_kink_station, seed_station_z_pos_l0, c::zPosKinkMagnet, zdelta_seed_doublet );
 
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-        //                      Start of search in Station 1                   //
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
+    start_seed_station_l3 = get_closest_hit_idx( hithandler.hits, start_seed_station_l3, secondXhitPred );
+    auto const delta      = std::abs( hithandler.hits[start_seed_station_l3] - secondXhitPred );
 
-        mydebug( "Start Station 1 Work" );
+    // x position of the second hit of the doublet
+    float const secondXhit = hithandler.hits[start_seed_station_l3];
 
-        S1buffer += 1;
+    // calculate x-slope of the found doublet
+    float const newtx = ( secondXhit - firstXhit ) * inv_zdelta_seed_doublet;
 
-        float const S1x3curve =
-            deltaSlope * ExtFac_alpha_S1L3 + deltaSlope * ExtFac_beta_S1L3 + deltaSlope * ExtFac_gamma_S1L3 * endv_ty2;
+    // deltaSlope is proportional to the momentum and used in many places to tune momentum dependant search windows
+    float const deltaSlope = newtx - endv_tx;
 
-        // the layer of the seeding hit, thus its z position, depends on the current iteration
-        float const S1x3pred = ( iteration == 0 ) ? firstXhit + newtx * ( betterZ[3] - betterZ[8] ) + S1x3curve
-                                                  : firstXhit + newtx * ( betterZ[3] - betterZ[4] ) + S1x3curve;
+    // the minimum value of 0.011 avoids too small search windows for tracks with P > 100 GeV
+    float const absDSlope = std::max( 0.011f, std::abs( deltaSlope ) );
+    mydebug( "absdslope", absDSlope );
 
-        mydebug( "S1X3straight", firstXhit + newtx * ( betterZ[3] - betterZ[4] ), "S1X3curve", S1x3curve, "pred",
-                 S1x3pred );
+    // check if the x-hit is within our search window acceptance, depending on the current iteration
+    if ( delta > std::min( c::doublet_win_off + c::doublet_win_slope * absDSlope, c::doublet_win_max ) ) continue;
 
-        auto const [selectS1L3, DeltaS1L3] =
-            SearchLayerForHitBinary( hithandler.hits, S1L3range.first, S1L3range.first + S1L3range.second, S1x3pred );
+    // we have a doublet and are going to try and build a track
+    // for that we need a couple tmp objects to keep track of stuff
+    // tmpidvec will store the indices of the hit to refind it later
+    boost::container::static_vector<int, 12> tmpidvec{start_seed_station_l3, idxL0};
+
+    // this is a vector that will remember some values for the linear fit performed at the very end.
+    // for each hit this will store 4 values, [xvalue, zdelta, curvature, residual]
+    // where zdelta is the distance between the hit and the hit in layer 0 of the seeding station
+    // curvature is the non-linear part of the prediction. See x3curve variable for example
+    std::array<float, 24> fitvec{firstXhit, 0, 0, 0, secondXhit, zdelta_seed_doublet, doubletCurve, 0};
 
-        bool S1Doublet = true;
+    // fitcnt simply keeps track of what's in the array
+    int fitcnt = 8;
+
+    float const direction = -1.f * std::copysign( 1.f, deltaSlope );
 
-        mydebug( "selectS1L3", selectS1L3 );
-        mydebug( "delta S1L3: ", DeltaS1L3, hithandler.IDs[selectS1L3] );
+    // correct the straight line estimate of the y-position at station 3 layer 0
+    float const ycorr =
+        absDSlope * ( direction * deltaYParams[0] +
+                      endv_ty * ( deltaYParams[1] + deltaYParams[5] * endv_ty2 +
+                                  endv_tx * ( direction * ( deltaYParams[2] + deltaYParams[6] * endv_ty2 ) +
+                                              endv_tx * ( deltaYParams[3] + deltaYParams[7] * endv_ty2 +
+                                                          deltaYParams[4] * direction * endv_tx ) ) ) );
 
-        // threshold of xhits in S1 and S2 to reject a candidate, depending on the iteration
-        // this is put in OR with the (numuvhits < 3) condition
-        int const numxhits_thr = ( iteration == 0 ) ? 4 : static_cast<int>( p_numXHitsThrS1S2_2ndloop );
+    // keep track of how many hits we find
+    int numuvhits{0};
+    int numxhits{2};
 
-        // threshold on DeltaS1L3, depending on the iteration
-        float const DeltaS1L3_thr =
-            ( iteration == 0 ) ? p_S1L3W_b + p_S1L3W_m * absDSlope : p_S1L3W_b_2ndloop + p_S1L3W_m_2ndloop * absDSlope;
+    // calculate the x-predictions for the U V layers in station 3
+    float const seed_l1_pred =
+        firstXhit +
+        newtx * ( cache.LayerZPos[up_down_offset + 1 + c::seed_first_layer_z_idx] - seed_station_z_pos_l0 ) +
+        ( yAtZ[1 + c::seed_first_layer_z_idx] - c::seed_scale_ycorr * ycorr ) *
+            cache.dXdYFiber[up_down_dxdy_offset + c::seed_first_layer_z_idx / 2];
 
-        if ( DeltaS1L3 < DeltaS1L3_thr ) {
-          tmpidvec.push_back( selectS1L3 );
-          ++numxhits;
+    mydebug( newtx, ( cache.LayerZPos[up_down_offset + 1 + c::seed_first_layer_z_idx] - seed_station_z_pos_l0 ),
+             ( yAtZ[1 + c::seed_first_layer_z_idx] - c::seed_scale_ycorr * ycorr ),
+             cache.dXdYFiber[up_down_dxdy_offset + c::seed_first_layer_z_idx / 2] );
 
-          // the delta z wrt the seeding layer depends on the current iteration
-          if ( iteration == 0 )
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS1L3], betterZ[3] - betterZ[8], S1x3curve,
-                        hithandler.hits[selectS1L3] - S1x3pred );
-          else
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS1L3], betterZ[3] - betterZ[4], S1x3curve,
-                        hithandler.hits[selectS1L3] - S1x3pred );
+    float const seed_l2_pred =
+        firstXhit +
+        newtx * ( cache.LayerZPos[up_down_offset + 2 + c::seed_first_layer_z_idx] - seed_station_z_pos_l0 ) +
+        ( yAtZ[2 + c::seed_first_layer_z_idx] - ( c::seed_scale_ycorr + 0.02f ) * ycorr ) *
+            cache.dXdYFiber[up_down_dxdy_offset + 1 + c::seed_first_layer_z_idx / 2];
 
-          mydebug( "Hit Selected" );
-        } else {
-          S1Doublet = false;
-          if ( numxhits < numxhits_thr or numuvhits < 3 ) continue;
-        }
+    mydebug( newtx, ( cache.LayerZPos[up_down_offset + 2 + c::seed_first_layer_z_idx] - seed_station_z_pos_l0 ),
+             ( yAtZ[2 + c::seed_first_layer_z_idx] - ( c::seed_scale_ycorr + 0.02f ) * ycorr ),
+             cache.dXdYFiber[up_down_dxdy_offset + 1 + c::seed_first_layer_z_idx / 2] );
 
-        float const S1x0curve =
-            ExtFac_alpha_S1L0 * deltaSlope + ExtFac_beta_S1L0 * deltaSlope + deltaSlope * ExtFac_gamma_S1L0 * endv_ty2;
+    // search for hits in layer
+    start_seed_station_l1    = get_closest_hit_idx( hithandler.hits, start_seed_station_l1, seed_l1_pred );
+    auto const seed_delta_l1 = hithandler.hits[start_seed_station_l1] - seed_l1_pred;
 
-        // the layer of the seeding hit, thus its z position, depends on the current iteration
-        float const S1x0pred = ( iteration == 0 ) ? firstXhit + newtx * ( betterZ[0] - betterZ[8] ) + S1x0curve
-                                                  : firstXhit + newtx * ( betterZ[0] - betterZ[4] ) + S1x0curve;
+    start_seed_station_l2    = get_closest_hit_idx( hithandler.hits, start_seed_station_l2, seed_l2_pred );
+    auto const seed_delta_l2 = hithandler.hits[start_seed_station_l2] - seed_l2_pred;
 
-        mydebug( "S1x0straight", firstXhit + newtx * ( betterZ[0] - betterZ[4] ), "S1x0curve", S1x0curve, "pred",
-                 S1x0pred );
+    mydebug( "deltas x1, x2: ", seed_delta_l1, seed_delta_l2, hithandler.IDs[start_seed_station_l1],
+             hithandler.IDs[start_seed_station_l2] );
 
-        auto const [selectS1L0, DeltaS1L0] =
-            SearchLayerForHitBinary( hithandler.hits, S1L0range.first, S1L0range.first + S1L0range.second, S1x0pred );
+    // calculate acceptance window for uv hits based on momentum
+    float const seed_station_uvwindow = ( c::seed_uv_win_off + c::seed_uv_win_slope * absDSlope );
 
-        mydebug( "selectS1L0", selectS1L0, "delta S1L0: ", DeltaS1L0, hithandler.IDs[selectS1L0] );
+    if ( std::abs( seed_delta_l2 ) < seed_station_uvwindow ) {
+      tmpidvec.push_back( start_seed_station_l2 );
+      ++numuvhits;
+    }
 
-        // to make the threshold on DeltaS1L0 depending on the iteration
-        float const DeltaS1L0_thr =
-            ( iteration == 0 ) ? p_S1L0W_b + p_S1L0W_m * absDSlope : p_S1L0W_b_2ndloop + p_S1L0W_m_2ndloop * absDSlope;
+    if ( std::abs( seed_delta_l1 ) < seed_station_uvwindow ) {
+      tmpidvec.push_back( start_seed_station_l1 );
+      ++numuvhits;
+    } else if ( numuvhits < 1 ) {
+      // didn't find any uv hits let's try our luck in the next loop iteration
+      continue;
+    }
 
-        if ( DeltaS1L0 < DeltaS1L0_thr ) {
-          tmpidvec.push_back( selectS1L0 );
-          ++numxhits;
+    // this variable is saved for later as it is quite good for ghost rejection
+    float const seed_uv_delta = numuvhits < 2 ? 1.0f : 1.0f + std::abs( seed_delta_l1 + seed_delta_l2 );
 
-          // the delta z wrt the seeding layer depends on the current iteration
-          if ( iteration == 0 )
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS1L0], betterZ[0] - betterZ[8], S1x0curve,
-                        hithandler.hits[selectS1L0] - S1x0pred );
-          else
-            fillFitVec( fitvec, fitcnt, hithandler.hits[selectS1L0], betterZ[0] - betterZ[4], S1x0curve,
-                        hithandler.hits[selectS1L0] - S1x0pred );
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
+    //                  Start of search in the second station              //
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
+    mydebug( "starting next station" );
 
-          mydebug( "Hit Selected" );
-        } else {
-          S1Doublet = false;
-          if ( numxhits < ( numxhits_thr + 1 ) or numuvhits < 3 ) continue;
-        }
+    // these variables will stay true if x-hits are found in both layers
+    bool next_station_doublet = true;
 
-        if ( iteration == 0 ) {
-          ++CounterXDoubletsS3S2S1;
-        } else {
-          ++CounterXDoubletsS3S2S1_2ndloop;
-        }
+    float const next_l0_curve = cache.ext_coeff[c::ext_coeff_start + up_down_offset + 6] * deltaSlope +
+                                cache.ext_coeff[c::ext_coeff_start + up_down_offset + 7] * deltaSlope +
+                                cache.ext_coeff[c::ext_coeff_start + up_down_offset + 8] * deltaSlope * endv_ty2;
 
-        float S1UVDelta = 1.0f;
+    float const next_station_z_pos_l0 = cache.LayerZPos[up_down_offset + c::next_first_layer_z_idx];
+    float const next_l0_pred = firstXhit + newtx * ( next_station_z_pos_l0 - seed_station_z_pos_l0 ) + next_l0_curve;
 
-        if ( S1Doublet ) {
-          S1UVbuffer += 1;
-          float const S1Tx =
-              ( hithandler.hits[selectS1L3] - hithandler.hits[selectS1L0] ) / ( betterZ[3] - betterZ[0] );
+    float const next_l3_curve = deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 9] +
+                                deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 10] +
+                                deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 11] * endv_ty2;
 
-          float const S1x1pred = hithandler.hits[selectS1L0] + S1Tx * ( betterZ[1] - betterZ[0] ) +
-                                 ( yAtZ[1] - .62f * ycorr ) * dXdYLayer[0];
+    float const next_station_z_pos_l3 = cache.LayerZPos[up_down_offset + c::next_first_layer_z_idx + 3];
+    float       next_l3_pred = firstXhit + newtx * ( next_station_z_pos_l3 - seed_station_z_pos_l0 ) + next_l3_curve;
 
-          float const S1x2pred = hithandler.hits[selectS1L0] + S1Tx * ( betterZ[2] - betterZ[0] ) +
-                                 ( yAtZ[2] - .63f * ycorr ) * dXdYLayer[1];
+    // get the best hit candidate
+    start_next_station_l3    = get_closest_hit_idx( hithandler.hits, start_next_station_l3, next_l3_pred );
+    auto const next_delta_l3 = hithandler.hits[start_next_station_l3] - next_l3_pred;
 
-          auto const [selectS1L1, DeltaS1L1] =
-              SearchLayerForHitBinary( hithandler.hits, S1L1range.first, S1L1range.first + S1L1range.second, S1x1pred );
+    mydebug( "next l3 pred", next_l3_pred, "curve", next_l3_curve, "delta", next_delta_l3, "idx",
+             start_next_station_l3 );
 
-          auto const [selectS1L2, DeltaS1L2] =
-              SearchLayerForHitBinary( hithandler.hits, S1L2range.first, S1L2range.first + S1L2range.second, S1x2pred );
+    // did we find a hit?
+    if ( std::abs( next_delta_l3 ) < ( c::next_l3_win_off + c::next_l3_win_slope * absDSlope ) ) {
+      mydebug( "hit accepted" );
 
-          // the UV window value depends on the current iteration
-          float const S1uvwindow = ( iteration == 0 ) ? ( p_UV1W_b + p_UV1W_m * absDSlope )
-                                                      : ( p_UV1W_b_2ndloop + p_UV1W_m_2ndloop * absDSlope );
+      tmpidvec.push_back( start_next_station_l3 );
+      ++numxhits;
 
-          if ( DeltaS1L1 > S1uvwindow and std::abs( DeltaS1L2 ) > S1uvwindow ) { continue; }
+      // fill fit vec with the x-hit information
+      fillFitVec( fitvec, fitcnt, hithandler.hits[start_next_station_l3], next_station_z_pos_l3 - seed_station_z_pos_l0,
+                  next_l3_curve, next_delta_l3 );
+    } else {
+      // we didn't find a hit and thus also not a doublet
+      next_station_doublet = false;
 
-          if ( DeltaS1L2 < S1uvwindow ) {
-            tmpidvec.push_back( selectS1L2 );
-            ++numuvhits;
-          }
+      // do we even really want to continue with this track or is it crap?
+      if ( numxhits + numuvhits < 4 ) continue;
+    }
 
-          if ( DeltaS1L1 < S1uvwindow ) {
-            tmpidvec.push_back( selectS1L1 );
-            ++numuvhits;
-          }
+    start_next_station_l0    = get_closest_hit_idx( hithandler.hits, start_next_station_l0, next_l0_pred );
+    auto const next_delta_l0 = hithandler.hits[start_next_station_l0] - next_l0_pred;
 
-          if ( DeltaS1L1 < S1uvwindow and DeltaS1L2 < S1uvwindow )
-            S1UVDelta += std::abs( hithandler.hits[selectS1L1] + hithandler.hits[selectS1L2] - S1x1pred - S1x2pred );
-        }
+    mydebug( "next l0 pred", next_l0_pred, "curve", next_l0_curve, "delta", next_delta_l0, "idx",
+             start_next_station_l0 );
 
-        mydebug( "End Station 1 Work" );
-
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-        //                      Start of finalization procedure                //
-        /////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////
-
-        // quick linear fit in x0 and tx, curvature terms remain fixed
-        float s0  = 0;
-        float sz  = 0;
-        float sz2 = 0;
-        float sd  = 0;
-        float sdz = 0;
-
-        for ( int idx{0}; idx < fitcnt; idx += 4 ) {
-          // for now the weight is just 1
-          s0 += 1;
-          sz += fitvec[idx + 1];
-          sz2 += fitvec[idx + 1] * fitvec[idx + 1];
-          sd += fitvec[idx + 3];
-          sdz += fitvec[idx + 3] * fitvec[idx + 1];
-        }
+    // did we find a hit?
+    if ( std::abs( next_delta_l0 ) < ( c::next_l0_win_off + c::next_l0_win_slope * absDSlope ) ) {
+      mydebug( "hit accepted" );
+      tmpidvec.push_back( start_next_station_l0 );
+      ++numxhits;
+
+      fillFitVec( fitvec, fitcnt, hithandler.hits[start_next_station_l0], next_station_z_pos_l0 - seed_station_z_pos_l0,
+                  next_l0_curve, next_delta_l0 );
+    } else {
+      next_station_doublet = false;
+      if ( numxhits + numuvhits < 5 ) continue;
+    }
 
-        float const den     = 1.f / ( sz * sz - 6 * sz2 );
-        float const xoff    = ( sdz * sz - sd * sz2 ) * den;
-        float const txoff   = ( sd * sz - 6 * sdz ) * den;
-        float const finaltx = newtx + txoff;
-        float const newx0   = firstXhit + xoff;
-
-        // calculate a chi2 after the fit
-        float chi2 = ( fitvec[0] - newx0 ) * ( fitvec[0] - newx0 );
-        for ( int idx{4}; idx < fitcnt; idx += 4 ) {
-          // for now the weight is just 1
-          float const diff = fitvec[idx] - ( newx0 + fitvec[idx + 1] * finaltx + fitvec[idx + 2] );
-          chi2 += diff * diff;
+    // we only check for uv hits if we found a doublet
+    // this choice was made since a doublet and it's slope allow for small search windows
+    // FIXME we could potentially allow this search in other/all scenarios but this need more tweaking
+    // Note that this search will only help with ghost rejection, not help with efficiencies
+    // Thus as long as ghosts aren't a problem there isn't really a big reason to loosen this requirement
+    float next_uv_delta{1.0f};
+
+    if ( next_station_doublet ) {
+      mydebug( "next uv search" );
+
+      float const next_tx = ( hithandler.hits[start_next_station_l3] - hithandler.hits[start_next_station_l0] ) /
+                            ( next_station_z_pos_l3 - next_station_z_pos_l0 );
+
+      float const next_l1_pred =
+          hithandler.hits[start_next_station_l0] +
+          next_tx * ( cache.LayerZPos[up_down_offset + c::next_first_layer_z_idx + 1] - next_station_z_pos_l0 ) +
+          ( yAtZ[1 + c::next_first_layer_z_idx] - c::next_scale_ycorr * ycorr ) *
+              cache.dXdYFiber[up_down_dxdy_offset + c::next_first_layer_z_idx / 2];
+
+      float const next_l2_pred =
+          hithandler.hits[start_next_station_l0] +
+          next_tx * ( cache.LayerZPos[up_down_offset + c::next_first_layer_z_idx + 2] - next_station_z_pos_l0 ) +
+          ( yAtZ[2 + c::next_first_layer_z_idx] - ( c::next_scale_ycorr + 0.02f ) * ycorr ) *
+              cache.dXdYFiber[up_down_dxdy_offset + 1 + c::next_first_layer_z_idx / 2];
+
+      start_next_station_l1    = get_closest_hit_idx( hithandler.hits, start_next_station_l1, next_l1_pred );
+      auto const next_delta_l1 = hithandler.hits[start_next_station_l1] - next_l1_pred;
+
+      start_next_station_l2    = get_closest_hit_idx( hithandler.hits, start_next_station_l2, next_l2_pred );
+      auto const next_delta_l2 = hithandler.hits[start_next_station_l2] - next_l2_pred;
+
+      mydebug( "next l1 pred", next_l1_pred, "delta", next_delta_l1, "idx", start_next_station_l1 );
+      mydebug( "next l2 pred", next_l2_pred, "delta", next_delta_l2, "idx", start_next_station_l2 );
+
+      float const S2uvwindow = ( c::next_uv_win_off + c::next_uv_win_slope * absDSlope );
+
+      if ( std::abs( next_delta_l1 ) > S2uvwindow and std::abs( next_delta_l2 ) > S2uvwindow ) { continue; }
+
+      if ( std::abs( next_delta_l1 ) < S2uvwindow and std::abs( next_delta_l2 ) < S2uvwindow ) {
+        next_uv_delta += std::abs( next_delta_l1 + next_delta_l2 );
+        tmpidvec.push_back( start_next_station_l2 );
+        ++numuvhits;
+        tmpidvec.push_back( start_next_station_l1 );
+        ++numuvhits;
+      } else {
+        if ( std::abs( next_delta_l2 ) < S2uvwindow ) {
+          tmpidvec.push_back( start_next_station_l2 );
+          ++numuvhits;
         }
 
-        // determine momentum * charge estimate, polynomial was fitted on MC
-        const float finaltx2 = finaltx * finaltx;
-        const float coef =
-            ( MomentumParams[0] + finaltx2 * ( MomentumParams[1] + MomentumParams[2] * finaltx2 ) +
-              MomentumParams[3] * finaltx * endv_tx + endv_ty2 * ( MomentumParams[4] + MomentumParams[5] * endv_ty2 ) +
-              MomentumParams[6] * endv_tx2 );
-
-        float const candPQ = ( iteration == 0 ? 1.f : 0.9818f ) * m_magscalefactor * coef / ( finaltx - endv_tx ) +
-                             factor * MomentumParams[7];
-
-        mydebug( finaltx, endv_tx, endv_ty2, factor, coef, candPQ );
-
-        // the condition evaluates to true if the track is a charge flipped one
-        float deltaMom = ( endv_pt > p_wrongSignPT and ( factor * ( firstXhit - straighExt ) < 0 ) )
-                             ? 3.f * std::abs( endv_qp * ( endv_pq + candPQ ) )
-                             : std::abs( endv_qp * ( endv_pq - candPQ ) );
-
-        // counteract worse momentum resolution at higher momenutm by scaling down the observed delta
-        if ( std::abs( candPQ ) > 40000 ) deltaMom *= 40000.f / std::abs( candPQ );
-
-        // For now a linear discriminant seems to do well enough. But if needed this could be substituted by a neural
-        // net for better rejection of ghosts
-        float const quality =
-            LDAParams[0] * approx_log( SIMDWrapper::scalar::float_v( 1.f + deltaMom ) ).cast() +
-            LDAParams[1] * approx_log( SIMDWrapper::scalar::float_v( 1.f + chi2 ) ).cast() +
-            LDAParams[2] *
-                approx_log( SIMDWrapper::scalar::float_v( std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ) ) )
-                    .cast() +
-            LDAParams[3] * approx_log( SIMDWrapper::scalar::float_v( S3UVDelta ) ).cast() +
-            LDAParams[4] * approx_log( SIMDWrapper::scalar::float_v( S2UVDelta ) ).cast() +
-            LDAParams[5] * approx_log( SIMDWrapper::scalar::float_v( S1UVDelta ) ).cast() +
-            LDAParams[6] * static_cast<float>( numuvhits ) + LDAParams[7] * static_cast<float>( numxhits );
-
-        // float const quality =
-        // LDAParams[0] * vdt::fast_logf( 1.f + deltaMom ) + LDAParams[1] * vdt::fast_logf( 1.f + chi2 ) +
-        // LDAParams[2] * vdt::fast_logf( std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ) ) +
-        // LDAParams[3] * vdt::fast_logf( S3UVDelta ) + LDAParams[4] * vdt::fast_logf( S2UVDelta ) +
-        // LDAParams[5] * vdt::fast_logf( S1UVDelta ) + LDAParams[6] * static_cast<float>( numuvhits ) +
-        // LDAParams[7] * static_cast<float>( numxhits );
-
-        mydebug( "+++++++++Trackcandidate++++++++++++++++" );
-        mydebug( candPQ, finaltx, endv_pq );
-        mydebug( "deltaMom", deltaMom, "chi2", chi2, "pt", std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ),
-                 "S3UVDelta", S3UVDelta );
-        mydebug( "final quality", quality, "worst candidate", bestcandidate.quality );
-
-        if ( iteration == 0 ) {
-          ++CounterCandsBeforeQuality;
-        } else {
-          ++CounterCandsBeforeQuality_2ndloop;
+        if ( std::abs( next_delta_l1 ) < S2uvwindow ) {
+          tmpidvec.push_back( start_next_station_l1 );
+          ++numuvhits;
         }
+      }
 
-        if ( quality < bestcandidate.quality ) {
-          bestcandidate = {tmpidvec, candPQ, finaltx, newx0, quality, numuvhits + numxhits};
+    } // end of the UV work in the first iteration
 
-          // we got a new candidate!
-          if ( iteration == 0 ) {
-            ++CounterCandsAfterQuality;
-          } else {
-            ++CounterCandsAfterQuality_2ndloop;
-          }
-        }
-      } // loop over possible station three doublets
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
+    //                      Start of search in Station 1                   //
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
 
-      mydebug( "Search done!" );
+    mydebug( "Start Station 1 Work" );
 
-      // save the bestcandidate if we built one
-      if ( !bestcandidate.ids.empty() ) {
-        int i    = Output.size();
-        int mask = true;
+    float const last_l3_curve = deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 3] +
+                                deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 4] +
+                                deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 5] * endv_ty2;
 
-        ++CounterCandsBuffer;
+    // the layer of the seeding hit, thus its z position, depends on the current iteration
+    float const last_station_z_pos_l3 = cache.LayerZPos[up_down_offset + c::last_first_layer_z_idx + 3];
+    float const last_l3_pred = firstXhit + newtx * ( last_station_z_pos_l3 - seed_station_z_pos_l0 ) + last_l3_curve;
 
-        Output.compressstore_trackVP<I>( i, mask, tracks.trackVP<I>( uttrack ) );
-        Output.compressstore_trackUT<I>( i, mask, uttrack );
+    auto const last_l3_hit_idx =
+        last_station_range_l3.first +
+        get_closest_hit_idx_binary(
+            {hithandler.hits.data() + last_station_range_l3.first, last_station_range_l3.second}, last_l3_pred );
+    auto const last_delta_l3 = hithandler.hits[last_l3_hit_idx] - last_l3_pred;
 
-        float const qop = 1.f / bestcandidate.PQ;
-        Output.compressstore_stateQoP<F>( i, mask, qop );
+    mydebug( "last_straight", firstXhit + newtx * ( last_station_z_pos_l3 - seed_station_z_pos_l0 ), "last_l3_curve",
+             last_l3_curve, "pred", last_l3_pred, "delta", last_delta_l3, "idx", last_l3_hit_idx );
 
-        int n_hits = 0;
+    bool last_station_doublet = true;
 
-        for ( auto idx{bestcandidate.ids.begin()}; idx != bestcandidate.ids.end(); ++idx, ++n_hits ) {
-          Output.compressstore_hit<I>( i, n_hits, mask, hithandler.IDs[*idx] );
-        }
+    if ( std::abs( last_delta_l3 ) < c::last_l3_win_off + c::last_l3_win_slope * absDSlope ) {
+      tmpidvec.push_back( last_l3_hit_idx );
+      ++numxhits;
 
-        Output.compressstore_nHits<I>( i, mask, bestcandidate.numHits );
+      // the delta z wrt the seeding layer depends on the current iteration
+      fillFitVec( fitvec, fitcnt, hithandler.hits[last_l3_hit_idx], last_station_z_pos_l3 - seed_station_z_pos_l0,
+                  last_l3_curve, last_delta_l3 );
+      mydebug( "Hit Selected" );
+    } else {
+      last_station_doublet = false;
+      if ( numxhits < 4 or numuvhits < 3 ) continue;
+    }
 
-        // AtT State
-        float const endT_z = cache.LayerZPos[8];
-        float const endT_x = bestcandidate.newx0;
-        auto        endT_y = endv_pos.y + endv_dir.y * ( endT_z - endv_pos.z );
+    float const last_l0_curve = deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 0] +
+                                deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 1] +
+                                deltaSlope * cache.ext_coeff[c::ext_coeff_start + up_down_offset + 2] * endv_ty2;
 
-        auto endT_pos = Vec3<F>( endT_x, endT_y, endT_z );
-        auto endT_dir = Vec3<F>( bestcandidate.finaltx, endv_dir.y, 1.f );
+    // the layer of the seeding hit, thus its z position, depends on the current iteration
+    float const last_station_z_pos_l0 = cache.LayerZPos[up_down_offset + c::last_first_layer_z_idx];
+    float const last_l0_pred = firstXhit + newtx * ( last_station_z_pos_l0 - seed_station_z_pos_l0 ) + last_l0_curve;
 
-        Output.compressstore_statePos<F>( i, mask, endT_pos );
-        Output.compressstore_stateDir<F>( i, mask, endT_dir );
+    auto const last_l0_hit_idx =
+        last_station_range_l0.first +
+        get_closest_hit_idx_binary(
+            {hithandler.hits.data() + last_station_range_l0.first, last_station_range_l0.second}, last_l0_pred );
+    auto const last_delta_l0 = hithandler.hits[last_l0_hit_idx] - last_l0_pred;
 
-        Output.size() += dType::popcount( mask );
+    mydebug( "last_straight", firstXhit + newtx * ( last_station_z_pos_l0 - seed_station_z_pos_l0 ), "last_l0_curve",
+             last_l0_curve, "pred", last_l0_pred, "delta", last_delta_l0, "idx", last_l0_hit_idx );
 
-        // if we found a candidate we won't start the second loop
-        break;
-      } // save the bestcandidate if we built one
+    if ( std::abs( last_delta_l0 ) < c::last_l0_win_off + c::last_l0_win_slope * absDSlope ) {
+      tmpidvec.push_back( last_l0_hit_idx );
+      ++numxhits;
 
-    } // end loop over iterations
-  }   // loop over the UT tracks
+      // the delta z wrt the seeding layer
+      fillFitVec( fitvec, fitcnt, hithandler.hits[last_l0_hit_idx], last_station_z_pos_l0 - seed_station_z_pos_l0,
+                  last_l0_curve, last_delta_l0 );
 
-  m_CounterOutput += Output.size();
-  return Output;
+      mydebug( "Hit Selected" );
+    } else {
+      last_station_doublet = false;
+      if ( numxhits < ( 5 ) or numuvhits < 3 ) continue;
+    }
+
+    float last_uv_delta = 1.0f;
+
+    if ( last_station_doublet ) {
+      mydebug( "last uv search" );
+      float const last_tx = ( hithandler.hits[last_l3_hit_idx] - hithandler.hits[last_l0_hit_idx] ) /
+                            ( last_station_z_pos_l3 - last_station_z_pos_l0 );
+
+      float const last_l1_pred =
+          hithandler.hits[last_l0_hit_idx] +
+          last_tx * ( cache.LayerZPos[up_down_offset + c::last_first_layer_z_idx + 1] - last_station_z_pos_l0 ) +
+          ( yAtZ[1 + c::last_first_layer_z_idx] - c::last_scale_ycorr * ycorr ) *
+              cache.dXdYFiber[up_down_dxdy_offset + c::last_first_layer_z_idx / 2];
+
+      float const last_l2_pred =
+          hithandler.hits[last_l0_hit_idx] +
+          last_tx * ( cache.LayerZPos[up_down_offset + c::last_first_layer_z_idx + 2] - last_station_z_pos_l0 ) +
+          ( yAtZ[2 + c::last_first_layer_z_idx] - ( c::last_scale_ycorr + 0.02f ) * ycorr ) *
+              cache.dXdYFiber[up_down_dxdy_offset + 1 + c::last_first_layer_z_idx / 2];
+
+      auto const last_l1_hit_idx =
+          last_station_range_l1.first +
+          get_closest_hit_idx_binary(
+              {hithandler.hits.data() + last_station_range_l1.first, last_station_range_l1.second}, last_l1_pred );
+      auto const last_delta_l1 = hithandler.hits[last_l1_hit_idx] - last_l1_pred;
+
+      auto const last_l2_hit_idx =
+          last_station_range_l2.first +
+          get_closest_hit_idx_binary(
+              {hithandler.hits.data() + last_station_range_l2.first, last_station_range_l2.second}, last_l2_pred );
+      auto const last_delta_l2 = hithandler.hits[last_l2_hit_idx] - last_l2_pred;
+
+      // the UV window value depends on the current iteration
+      float const last_uvwindow = c::last_uv_win_off + c::last_uv_win_slope * absDSlope;
+
+      mydebug( "last l1 pred", last_l1_pred, "delta", last_delta_l1, "idx", last_l1_hit_idx );
+      mydebug( "last l2 pred", last_l2_pred, "delta", last_delta_l2, "idx", last_l2_hit_idx );
+
+      if ( std::abs( last_delta_l1 ) > last_uvwindow and std::abs( last_delta_l2 ) > last_uvwindow ) {
+        mydebug( "none found;skipping" );
+        continue;
+      }
+      if ( std::abs( last_delta_l1 ) < last_uvwindow and std::abs( last_delta_l2 ) < last_uvwindow ) {
+        last_uv_delta += std::abs( last_delta_l1 + last_delta_l2 );
+        tmpidvec.push_back( last_l2_hit_idx );
+        ++numuvhits;
+        tmpidvec.push_back( last_l1_hit_idx );
+        ++numuvhits;
+      } else {
+
+        if ( std::abs( last_delta_l2 ) < last_uvwindow ) {
+          tmpidvec.push_back( last_l2_hit_idx );
+          ++numuvhits;
+        }
+
+        if ( std::abs( last_delta_l1 ) < last_uvwindow ) {
+          tmpidvec.push_back( last_l1_hit_idx );
+          ++numuvhits;
+        }
+      }
+    }
+
+    mydebug( "End Station 1 Work" );
+
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
+    //                      Start of finalization procedure                //
+    /////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////
+
+    mydebug( "+++++++++Trackcandidate++++++++++++++++" );
+
+    // mydebug( fitvec, fitcnt, newtx, firstXhit );
+    auto const [finaltx, newx0, chi2] = simple_fit( fitvec, fitcnt, newtx, firstXhit );
+    float const candPQ                = calc_mom<std::is_same_v<c, config_loop_1>>( finaltx, endv_tx, endv_ty2,
+                                                                     std::copysign( 1.f, endv_qp ) * factor, factor );
+    auto const  quality =
+        calc_quality( endv_pt, factor, firstXhit, straighExt, endv_pq, endv_qp, candPQ, p_wrongSignPT.value(), chi2,
+                      endv_txy, seed_uv_delta, next_uv_delta, last_uv_delta, numuvhits, numxhits );
+
+    if ( quality < bestcandidate.quality )
+      bestcandidate = {tmpidvec, candPQ, finaltx, newx0, quality, numuvhits + numxhits};
+
+    mydebug( candPQ, finaltx, endv_pq );
+    mydebug( seed_uv_delta, next_uv_delta, last_uv_delta );
+    mydebug( "chi2", chi2, "pt", std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ) );
+    mydebug( "final quality", quality, "worst candidate", bestcandidate.quality );
+  } // loop over possible station three doublets
+  return bestcandidate;
 }
+
-- 
GitLab


From 972b9f232a45f64340bc20ef66931c28c3a34972 Mon Sep 17 00:00:00 2001
From: Gitlab CI <noreply@cern.ch>
Date: Mon, 6 Apr 2020 15:55:27 +0000
Subject: [PATCH 2/3] Fixed formatting

patch generated by https://gitlab.cern.ch/lhcb/Rec/-/jobs/7889792
---
 Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
index e8ee273a399..3caf3e10c82 100644
--- a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
+++ b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
@@ -1204,4 +1204,3 @@ template <typename config>
   } // loop over possible station three doublets
   return bestcandidate;
 }
-
-- 
GitLab


From 20fe81a55d0a4d5f7b5c176d4217b1e8cd076b83 Mon Sep 17 00:00:00 2001
From: Christoph Hasse <hasse.christoph@outlook.de>
Date: Thu, 9 Apr 2020 18:47:59 +0200
Subject: [PATCH 3/3] try to work around clang bug

---
 Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
index 3caf3e10c82..fe148be0a2e 100644
--- a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
+++ b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
@@ -1187,8 +1187,8 @@ template <typename config>
     mydebug( "+++++++++Trackcandidate++++++++++++++++" );
 
     // mydebug( fitvec, fitcnt, newtx, firstXhit );
-    auto const [finaltx, newx0, chi2] = simple_fit( fitvec, fitcnt, newtx, firstXhit );
-    float const candPQ                = calc_mom<std::is_same_v<c, config_loop_1>>( finaltx, endv_tx, endv_ty2,
+    auto [finaltx, newx0, chi2] = simple_fit( fitvec, fitcnt, newtx, firstXhit );
+    float const candPQ          = calc_mom<std::is_same_v<c, config_loop_1>>( finaltx, endv_tx, endv_ty2,
                                                                      std::copysign( 1.f, endv_qp ) * factor, factor );
     auto const  quality =
         calc_quality( endv_pt, factor, firstXhit, straighExt, endv_pq, endv_qp, candPQ, p_wrongSignPT.value(), chi2,
-- 
GitLab