diff --git a/Phys/FunctorCore/tests/src/InstantiateFunctors.cpp b/Phys/FunctorCore/tests/src/InstantiateFunctors.cpp
index 144734b7ab5a0b02f137c9c48c0694ff67456866..e91d2c6e38313a03a70418161f5126cb2cb787d1 100644
--- a/Phys/FunctorCore/tests/src/InstantiateFunctors.cpp
+++ b/Phys/FunctorCore/tests/src/InstantiateFunctors.cpp
@@ -249,9 +249,9 @@ using m_3 = std::invoke_result_t<decltype( Functors::Composite::Mass ), soa_comp
 
 using l_1 =
     std::invoke_result_t<decltype( Functors::Composite::Lifetime ), LHCb::RecVertex const&, LHCb::Particle const&>;
-using l_1 = std::invoke_result_t<decltype( Functors::Composite::Lifetime ), LHCb::Event::v2::RecVertex const*,
+using l_2 = std::invoke_result_t<decltype( Functors::Composite::Lifetime ), LHCb::Event::v2::RecVertex const*,
                                  LHCb::Particle const*>;
-using l_1 = std::invoke_result_t<decltype( Functors::Composite::Lifetime ), LHCb::PrimaryVertex const&, soa_composite>;
+using l_3 = std::invoke_result_t<decltype( Functors::Composite::Lifetime ), LHCb::PrimaryVertex const&, soa_composite>;
 
 using dls_1 = std::invoke_result_t<decltype( Functors::Composite::ComputeDecayLengthSignificance ),
                                    LHCb::RecVertex const&, LHCb::Particle const&>;
diff --git a/Phys/SelTools/include/SelTools/DistanceCalculator.h b/Phys/SelTools/include/SelTools/DistanceCalculator.h
index 08b57050d8742e8ccbbe55e19ea5bf616795b36e..ebb186e1b60896dba5210d9e27429a43720e539e 100644
--- a/Phys/SelTools/include/SelTools/DistanceCalculator.h
+++ b/Phys/SelTools/include/SelTools/DistanceCalculator.h
@@ -192,6 +192,69 @@ namespace Sel {
 
   } // end of namespace helper
 
+  // The following functions provide the p4 and its cov matrix in the
+  // "(Px,Py,Pz,M)" representation. This is required if you'd like to
+  // do anything with the mass or covariance matrix in single
+  // precision.
+  namespace {
+    template <typename Particle>
+    auto fourMomentumPM( const Particle& p ) {
+      const auto mom = fourMomentum( p );
+      const auto m2  = mass2( p );
+      using float_v  = std::decay_t<decltype( mom( 0 ) )>;
+      return LHCb::LinAlg::Vec<float_v, 4>{ mom( 0 ), mom( 1 ), mom( 2 ), sqrt( m2 ) };
+    }
+
+    template <typename Particle>
+    auto dPMdPE( const Particle& p ) {
+      // create the PE->PM Jacobian
+      const auto mom = fourMomentumPM( p );
+      const auto e   = sqrt( LHCb::LinAlg::dot( mom, mom ) );
+      const auto m   = mom( 3 );
+      using float_v  = std::decay_t<decltype( m )>;
+      LHCb::LinAlg::Mat<float_v, 4, 4> J;
+      J( 0, 0 ) = J( 1, 1 ) = J( 2, 2 ) = 1;
+      J( 3, 0 )                         = -mom( 0 ) / m;
+      J( 3, 1 )                         = -mom( 1 ) / m;
+      J( 3, 2 )                         = -mom( 2 ) / m;
+      J( 3, 3 )                         = e / m;
+      return J;
+    }
+
+    template <typename Particle>
+    auto momPMCovMatrix( const Particle& p ) {
+      return similarity( dPMdPE( p ), momCovMatrix( p ) );
+    }
+
+    template <typename Particle>
+    auto momPMPosCovMatrix( const Particle& p ) {
+      return dPMdPE( p ) * momPosCovMatrix( p );
+    }
+
+    // To get this to work, we need to perform these computations in double precision,
+    // which now leads to some annoying code duplication
+    inline auto dPMdPE( const LHCb::Particle& p ) {
+      const auto                        mom = p.momentum();
+      const auto                        m   = mom.M();
+      ROOT::Math::SMatrix<double, 4, 4> J;
+      // LHCb::LinAlg::Mat<double, 4, 4> J ;
+      J( 0, 0 ) = J( 1, 1 ) = J( 2, 2 ) = 1;
+      J( 3, 0 )                         = -mom.Px() / m;
+      J( 3, 1 )                         = -mom.Py() / m;
+      J( 3, 2 )                         = -mom.Pz() / m;
+      J( 3, 3 )                         = mom.E() / m;
+      return J;
+    }
+    inline auto momPMCovMatrix( const LHCb::Particle& p ) {
+      const Gaudi::SymMatrix4x4 c = ROOT::Math::Similarity( dPMdPE( p ), p.momCovMatrix() );
+      return LHCb::LinAlg::convert<SIMDWrapper::scalar::float_v>( c ); // cannot convert expressions
+    }
+    inline auto momPMPosCovMatrix( const LHCb::Particle& p ) {
+      const ROOT::Math::SMatrix<double, 4, 3> c = dPMdPE( p ) * p.posMomCovMatrix();
+      return LHCb::LinAlg::convert<SIMDWrapper::scalar::float_v>( c ); // cannot convert expressions
+    }
+  } // namespace
+
   struct LifetimeFitter {
     // FIXME: remove default argument...
     LifetimeFitter() = default;
@@ -209,15 +272,17 @@ namespace Sel {
                               StatusCode::FAILURE );
       }
       m_no_convergence.emplace( m_owning_algorithm, "Lifetime fit did not converge. Aborting." );
-      m_negative_variance.emplace( m_owning_algorithm, "Negative variance produced in lifetime fit iteration." );
+      m_fit_failure.emplace( m_owning_algorithm, "Lifetime fit failure." );
+      m_dls_failure.emplace( m_owning_algorithm, "DLS failure" );
     }
 
   private:
     Gaudi::Algorithm*                                     m_owning_algorithm{ nullptr };
     mutable std::optional<Gaudi::Accumulators::Counter<>> m_no_convergence;
-    mutable std::optional<Gaudi::Accumulators::Counter<>> m_negative_variance;
+    mutable std::optional<Gaudi::Accumulators::Counter<>> m_fit_failure;
+    mutable std::optional<Gaudi::Accumulators::Counter<>> m_dls_failure;
 
-    static constexpr float NonPhysicalValue = std::numeric_limits<float>::quiet_NaN();
+    static constexpr auto NonPhysicalValue = std::numeric_limits<float>::quiet_NaN();
 
   public:
     /** @fn   DecayLengthSignificance
@@ -259,6 +324,8 @@ namespace Sel {
       }
 
       auto [success, W_Inv] = W.invChol();
+      const auto nfailures  = popcount( !success );
+      if ( m_dls_failure.has_value() && nfailures > 0 ) *m_dls_failure += nfailures;
       // Temporary fix: If W is not invertible due to neg Det, exclude 'a' terms.
       if ( !success ) {
         for ( size_t row = 0; row < 3; ++row ) {
@@ -299,13 +366,11 @@ namespace Sel {
       // LoKi calls the transported particle 'good'
       // auto [status, transported] = m_transporter->transport( p1, z, p2 );
       // LoKi fitter defines a 'decay' variable as particle.endVertex
-      auto ctau = ctau0( primary, particle );
-
+      auto ctau     = ctau0( primary, particle );
       using float_v = decltype( ctau );
       float_v error = -1.e+10 * Gaudi::Units::mm;
-      float_v chi2  = -1.e+10;
+      float_v chi2  = std::numeric_limits<float_v>::max();
       iterate( primary, particle, ctau, error, chi2 );
-
       auto lifetime = ctau / Gaudi::Units::c_light;
       error /= Gaudi::Units::c_light;
       return std::tuple{ lifetime, error, chi2 };
@@ -321,23 +386,21 @@ namespace Sel {
     auto iterate( VContainer const& primary, Particle const& particle, float_v& ctau, float_v& error,
                   float_v& chi2 ) const {
 
-      using LHCb::Event::endVertexPos;
-      using LHCb::Event::fourMomentum;
       using Sel::Utils::all;
       using std::abs;
       using std::isnan;
       using std::sqrt;
 
       // convergence parameters
-      const float_v delta_chi2 = 0.001;
-      const int     m_max_iter = 5;
-      const auto    ctau_init  = ctau;
+      const float_v max_delta_chi2    = 0.001;
+      const float_v max_reldelta_chi2 = 0.001;
+      const int     m_max_iter        = 5;
 
       // invariants which are not changed during iteration
-      const auto momCov        = momCovMatrix( particle );
+      const auto momCov        = momPMCovMatrix( particle );
       const auto posCov        = posCovMatrix( particle );
-      const auto momPosCov     = momPosCovMatrix( particle );
-      const auto initMom       = fourMomentum( particle );
+      const auto momPosCov     = momPMPosCovMatrix( particle );
+      const auto initMom       = fourMomentumPM( particle );
       const auto initPos       = endVertexPos( particle );
       const auto primaryPosCov = posCovMatrix( primary );
       const auto primaryPos    = endVertexPos( primary );
@@ -347,21 +410,28 @@ namespace Sel {
       auto decvertex  = initPos;
       auto primvertex = primaryPos;
 
-      auto converged = false;
+      // need to find a better way to get a mask with values false for both simd types, scalars and doubles
+      // auto converged = LHCb::type_map<float_v>::mask_v{false} ;
+      auto converged = float_v{ 1 } < 0;
+      auto success   = float_v{ 1 } > 0;
       for ( auto iter = 0; iter < m_max_iter; iter++ ) {
-        const auto& [new_ctau, new_chi2, new_error] =
-            ctau_step( primaryPos, primaryPosCov, initMom, initPos, momCov, posCov, momPosCov, momentum, decvertex,
-                       primvertex, ctau );
-        converged = all( abs( chi2 - new_chi2 ) < delta_chi2 );
-        ctau      = new_ctau;
-        chi2      = new_chi2;
-        error     = new_error;
-        if ( converged ) { break; }
+        const auto& [stepsuccess, new_chi2] = ctau_step( primaryPos, primaryPosCov, initMom, initPos, momCov, posCov,
+                                                         momPosCov, momentum, decvertex, primvertex, ctau, error );
+        const auto delta_chi2               = new_chi2 - chi2; // note: in first iteration chi2=limits::max<float> ;
+        success                             = success && stepsuccess;
+        converged                           = iter > 0 &&
+                    ( ( abs( delta_chi2 ) < max_delta_chi2 ) || ( abs( delta_chi2 ) < max_reldelta_chi2 * new_chi2 ) );
+        chi2 = new_chi2;
+        if ( all( converged || !stepsuccess ) ) { break; }
       }
-      // Temporary fix: In case of NaN, take initial value for ctau
-      float ctau_float = static_cast<float>( ctau );
-      ctau             = isnan( ctau_float ) ? ctau_init : ctau;
-      if ( !converged && m_no_convergence.has_value() ) { ++( *m_no_convergence ); }
+
+      const auto nfailures = popcount( !success );
+      if ( m_fit_failure.has_value() )
+        for ( int i = 0; i < nfailures; ++i ) ++( *m_fit_failure );
+
+      const auto nnonconverged = popcount( !converged );
+      if ( m_no_convergence.has_value() )
+        for ( int i = 0; i < nnonconverged; ++i ) ++( *m_no_convergence );
     }
 
     /** @fn  ctau_step
@@ -376,84 +446,70 @@ namespace Sel {
     template <typename MomCov, typename PosCov, typename MomPosCov, typename Vec4D, typename Vec3D, typename float_v>
     auto ctau_step( Vec3D const& primaryPos, PosCov const& primaryPosCov, Vec4D const& initMom, Vec3D const& initPos,
                     MomCov const& momCov, PosCov const& posCov, MomPosCov const& momPosCov, Vec4D& momentum,
-                    Vec3D& decvertex, Vec3D& primvertex, float_v const& ctau ) const {
+                    Vec3D& decvertex, Vec3D& primvertex, float_v& ctau, float_v& error ) const {
 
       using std::sqrt;
-
       auto const px = X( momentum );
       auto const py = Y( momentum );
       auto const pz = Z( momentum );
-      auto const e  = E( momentum );
-      auto const m2 = e * e - ( px * px + py * py + pz * pz );
-      auto const m  = sqrt( m2 );
+      auto const m  = E( momentum );
+      // auto const m2 = m*m ;
+      // auto const e  = sqrt(  px * px + py * py + pz * pz  + m2 ) ;
 
       // LoKi::Fitters::e_ctau
       auto const vec_E = LHCb::LinAlg::Vec{ px, py, pz } / m;
 
+      // This is the residual
+      auto const m_d = vec_E * ctau + ( primvertex - decvertex );
+
+      // This is the derivative of the residual to the momentum
       LHCb::LinAlg::Mat<float_v, 3, 4> mat_W;
-      mat_W( 0, 0 ) = ( 1.0 + px * px / m2 );
-      mat_W( 0, 1 ) = ( px * py / m2 );
-      mat_W( 0, 2 ) = ( px * pz / m2 );
-      mat_W( 1, 0 ) = mat_W( 0, 1 );
-      mat_W( 1, 1 ) = ( 1.0 + py * py / m2 );
-      mat_W( 1, 2 ) = ( py * pz / m2 );
-      mat_W( 2, 0 ) = mat_W( 0, 2 );
-      mat_W( 2, 1 ) = mat_W( 1, 2 );
-      mat_W( 2, 2 ) = ( 1.0 + pz * pz / m2 );
-
-      mat_W( 0, 3 ) = ( -px * e / m2 );
-      mat_W( 1, 3 ) = ( -py * e / m2 );
-      mat_W( 2, 3 ) = ( -pz * e / m2 );
+      mat_W( 0, 0 ) = 1.0;
+      mat_W( 1, 1 ) = 1.0;
+      mat_W( 2, 2 ) = 1.0;
+      mat_W( 0, 3 ) = -px / m;
+      mat_W( 1, 3 ) = -py / m;
+      mat_W( 2, 3 ) = -pz / m;
       mat_W         = mat_W * ctau / m;
 
-      auto const m_d         = vec_E * ctau + ( primvertex - decvertex );
       auto const mat_VD_part = mat_W * momPosCov;
       auto const mat_VD_tmp  = similarity( mat_W, momCov ) + posCov + primaryPosCov -
                               ( mat_VD_part + mat_VD_part.transpose() ).cast_to_sym();
-      auto const [success, mat_VD] = mat_VD_tmp.invChol();
+      auto const [invsuccess, mat_VD] = mat_VD_tmp.invChol();
 
       const auto m_Da0 = mat_W * ( initMom - momentum ) - ( initPos - decvertex ) + ( primaryPos - primvertex );
 
       const auto m_l0 = mat_VD * ( m_Da0 + m_d );
 
       const auto ctau_variance = 1. / similarity( vec_E, mat_VD );
-      if constexpr ( std::is_arithmetic_v<decltype( success )> ) {
-        if ( !success )
-          return std::tuple<float_v, float_v, float_v>{ NonPhysicalValue, NonPhysicalValue, NonPhysicalValue };
-        if ( ctau_variance < 0. ) {
-          if ( m_negative_variance.has_value() ) { ++( *m_negative_variance ); }
-          return std::tuple<float_v, float_v, float_v>{ NonPhysicalValue, NonPhysicalValue, NonPhysicalValue };
-        }
-      }
 
-      const auto delta_ctau = -ctau_variance * dot( vec_E, m_l0 );
+      const auto success    = invsuccess && ctau_variance > 0;
+      const auto delta_ctau = select( success, -ctau_variance * dot( vec_E, m_l0 ), 0 );
 
       const auto  m_D1 = momCov * mat_W.transpose() - momPosCov;
       const auto  m_D2 = ( mat_W * momPosCov ).transpose() - posCov;
       const auto& m_D3 = primaryPosCov;
 
-      const auto m_l = m_l0 + ( mat_VD * vec_E * delta_ctau );
+      // select doesn't work very well together with LinAlg
+      // const auto m_l = select(success,m_l0 + ( mat_VD * vec_E * delta_ctau ),LHCb::LinAlg::Vec<float_v, 3>{});
+      auto m_l = m_l0 + ( mat_VD * vec_E * delta_ctau );
+      for ( int i = 0; i < 3; ++i ) m_l( i ) = select( success, m_l( i ), float_v{ 0 } );
 
       const auto delta_momentum    = m_D1 * m_l * -1;
       const auto delta_decay_pos   = m_D2 * m_l * -1;
       const auto delta_primary_pos = m_D3 * m_l * -1;
 
       // Fill in 'output' values
-      auto const updated_ctau = ctau + delta_ctau;
-      auto const chi2         = LHCb::LinAlg::dot( m_l, m_Da0 + m_d );
-      auto const error        = sqrt( ctau_variance );
+      auto const chi2 = LHCb::LinAlg::dot( m_l, m_Da0 + m_d );
+      error           = sqrt( ctau_variance );
 
       // Update for the next iteration
+      ctau       = ctau + delta_ctau;
       momentum   = initMom + delta_momentum;
       decvertex  = initPos + delta_decay_pos;
       primvertex = primaryPos + delta_primary_pos;
 
-      if constexpr ( std::is_arithmetic_v<decltype( success )> ) {
-        return std::tuple{ updated_ctau, chi2, error };
-      } else {
-        return std::tuple{ select( success, updated_ctau, NonPhysicalValue ), select( success, chi2, NonPhysicalValue ),
-                           select( success, error, NonPhysicalValue ) };
-      }
+      return std::tuple{ success, chi2 };
     }
 
     /**  @brief Fast, approximate evaluation of c * tau.