File: cost.py

"""
Standard cost functions to minimize for statistical fits.

We provide these for convenience, so that you do not have to write your own for standard
fits. The cost functions optionally use Numba to accelerate some calculations, if Numba
is installed.

**There is no need** to set :attr:`iminuit.Minuit.errordef` manually for any of these
cost functions. :class:`iminuit.Minuit` automatically uses the correct value, which is
provided by each cost function with the attribute ``Cost.errordef``.

What to use when
----------------
- Fit a normalised probability density to data

    - Data are not binned: :class:`UnbinnedNLL`
    - Data are binned: :class:`BinnedNLL`, also supports histogram of weighted samples

- Fit a density to data, density is not normalised

    - Data are not binned: :class:`ExtendedUnbinnedNLL`
    - Data are binned: :class:`ExtendedBinnedNLL`, also supports
      histogram of weighted samples

- Fit a template to binned data with bin-wise uncertainties on the template

    - :class:`Template`, also supports weighted data and weighted template histograms

- Fit of a function f(x) to (x, y, yerror) pairs with normally distributed fluctuations.
  x is one- or multi-dimensional, y is one-dimensional.

    - y values contain no outliers: :class:`LeastSquares`
    - y values contain outliers: :class:`LeastSquares` with loss function set to
      "soft_l1"

- Include constraints from external fits or apply regularisation

    - :class:`NormalConstraint`

Combining cost functions
------------------------
All cost functions can be added, which generates a new combined cost function.
Parameters with the same name are shared between component cost functions. Use this to
constrain one or several parameters with different data sets, using a different
statistical model for each data set. Gaussian penalty terms can also be added to the
cost function to introduce external knowledge about a parameter.
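
For example, a minimal sketch (``pdf1``, ``pdf2``, and the data samples are placeholders
for user-defined models and data)::

    from iminuit import Minuit
    from iminuit.cost import UnbinnedNLL, NormalConstraint

    # both models depend on a parameter called "mu", so it is shared in the sum
    c1 = UnbinnedNLL(sample1, pdf1)         # pdf1(x, mu, sigma)
    c2 = UnbinnedNLL(sample2, pdf2)         # pdf2(x, mu, tau)
    c3 = NormalConstraint("tau", 1.0, 0.1)  # Gaussian penalty term for tau
    cost = c1 + c2 + c3
    m = Minuit(cost, mu=0.0, sigma=1.0, tau=1.0)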

Model parameter limits
----------------------
The Minuit algorithms support box constraints in parameter space. A user-defined model
can declare that a parameter is only valid over an interval on the real line with the
``Annotated`` type annotation, see :class:`iminuit.Minuit` for details. A typical
example is the sigma parameter of a normal distribution, which must be positive. The
cost functions defined here propagate this information to :class:`iminuit.Minuit`.
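
For example, a sketch of a model signature that declares a positive width parameter
(this assumes the third-party ``annotated_types`` package is used for the metadata)::

    from typing import Annotated
    from annotated_types import Gt

    def model(x, mu: float, sigma: Annotated[float, Gt(0)]):
        ...  # normal density with location mu and strictly positive width sigma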

Note: The :class:`Template` declares that the template amplitudes must be non-negative,
which is usually the right choice; however, it may be desirable to fit templates which
can have negative amplitudes. To achieve this, simply reset the limits with
:attr:`iminuit.Minuit.limits` after creating the Minuit instance.

User-defined gradients
----------------------
If the user provides a model gradient, the cost functions defined here (except
:class:`Template`) also make their own gradient available, which is then automatically
used by :class:`iminuit.Minuit` (see the constructor for details) to potentially
improve the fit, i.e. its convergence or robustness.
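
For example, a sketch of a model with an analytical gradient (the normal density and
its parameterisation are illustrative; ``data`` is a placeholder for the user's
sample)::

    import numpy as np
    from iminuit.cost import UnbinnedNLL

    def pdf(x, mu, sigma):
        z = (x - mu) / sigma
        return np.exp(-0.5 * z**2) / (np.sqrt(2 * np.pi) * sigma)

    def pdf_grad(x, mu, sigma):
        # returns shape (2, len(x)): derivatives of pdf with respect to mu and sigma
        z = (x - mu) / sigma
        p = pdf(x, mu, sigma)
        return np.array([p * z / sigma, p * (z * z - 1) / sigma])

    cost = UnbinnedNLL(data, pdf, grad=pdf_grad)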

Note that it is perfectly normal to use Minuit without a user-defined gradient, and
Minuit does not always benefit from a user-defined gradient. If the gradient is
expensive to compute, the time to converge may increase. If you have trouble with the
fitting process, it is unlikely that the issues are resolved by a user-defined gradient.

Notes
-----
The cost functions defined here have been optimized with knowledge about implementation
details of Minuit to give the highest accuracy and the most robust results, so they
should perform well. If you have trouble with your own implementations, try these.

The binned versions of the log-likelihood fits support weighted samples. For each bin of
the histogram, the sum of weights and the sum of squared weights are then needed; see
the class documentation for details.
"""

from __future__ import annotations

from .util import (
    describe,
    merge_signatures,
    PerformanceWarning,
    _smart_sampling,
    _detect_log_spacing,
    is_positive_definite,
)
from .typing import Model, ModelGradient, LossFunction
import numpy as np
from numpy.typing import NDArray, ArrayLike
from collections.abc import Sequence as ABCSequence
import abc
from typing import (
    List,
    Tuple,
    Union,
    Sequence,
    Collection,
    Dict,
    Any,
    Iterable,
    Optional,
    TypeVar,
    Callable,
    cast,
)
import warnings
from ._deprecated import deprecated_parameter

__all__ = [
    "CHISQUARE",
    "NEGATIVE_LOG_LIKELIHOOD",
    "chi2",
    "multinomial_chi2",
    "poisson_chi2",
    "template_chi2_jsc",
    "template_chi2_da",
    "template_nll_asy",
    "Cost",
    "CostSum",
    "Constant",
    "BinnedNLL",
    "UnbinnedNLL",
    "ExtendedBinnedNLL",
    "ExtendedUnbinnedNLL",
    "Template",
    "LeastSquares",
]

T = TypeVar("T", float, NDArray)

CHISQUARE = 1.0
NEGATIVE_LOG_LIKELIHOOD = 0.5

_TINY_FLOAT = np.finfo(float).tiny


def log_or_zero(x):
    """
    Evaluate to log(x) for x > 0 and to 0 otherwise.

    Parameters
    ----------
    x : array
        Argument.

    Returns
    -------
    array
        Elementwise contains log(x) for x > 0 and zero otherwise.
    """
    # return 0 for x <= 0
    r = np.zeros_like(x)
    ma = x > 0
    r[ma] = np.log(x[ma])
    return r


def _unbinned_nll(x):
    # sorting makes sum more accurate, protect against x = 0
    return -np.sum(np.sort(np.log(x + _TINY_FLOAT)))


def _z_squared(y, ye, ym):
    z = (y - ym) / ye
    return z * z


def _replace_none(x, replacement):
    if x is None:
        return replacement
    return x


def chi2(y: ArrayLike, ye: ArrayLike, ym: ArrayLike) -> float:
    """
    Compute (potentially) chi2-distributed cost.

    The value returned by this function is chi2-distributed, if the observed values are
    normally distributed around the expected values with the provided standard
    deviations.

    Parameters
    ----------
    y : array-like with shape (N,)
        Observed values.
    ye : array-like with shape (N,)
        Uncertainties of values.
    ym : array-like with shape (N,)
        Expected values.

    Returns
    -------
    float
        Value of cost function.
    """
    y, ye, ym = np.atleast_1d(y, ye, ym)
    assert y.ndim == 1
    return np.sum(_z_squared(y, ye, ym))


def _chi2_grad(y: ArrayLike, ye: ArrayLike, ym: ArrayLike, gym: ArrayLike) -> NDArray:
    """
    Compute gradient of :func:`chi2`.

    Parameters
    ----------
    y : array-like  with shape (N,)
        Observed values.
    ye : array-like  with shape (N,)
        Uncertainties of values.
    ym : array-like with shape (N,)
        Expected values.
    gym : array-like with shape (K, N)
        Gradient of ym with respect to K model parameters.

    Returns
    -------
    array with shape (K,)
        Gradient of cost function with respect to model parameters.
    """
    y, ye, ym, gym = np.atleast_1d(y, ye, ym, gym)
    assert y.ndim == 1
    assert gym.ndim == 2
    return -2 * np.sum((y - ym) * gym * ye**-2, axis=1)


def _soft_l1_cost(y: NDArray, ye: NDArray, ym: NDArray) -> float:
    z_sqr = _z_squared(y, ye, ym)
    return 2 * np.sum(np.sqrt(1 + z_sqr) - 1)


def _soft_l1_cost_grad(y: NDArray, ye: NDArray, ym: NDArray, gym: NDArray) -> NDArray:
    inv_ye = 1 / ye
    z = (y - ym) * inv_ye
    f = (1 + z**2) ** -0.5
    return -2 * np.sum(z * inv_ye * f * gym, axis=tuple(range(1, gym.ndim)))


def poisson_chi2(n: ArrayLike, mu: ArrayLike) -> float:
    """
    Compute asymptotically chi2-distributed cost for Poisson-distributed data.

    See Baker & Cousins, NIM 221 (1984) 437-442.

    Parameters
    ----------
    n : array-like
        Observed counts.
    mu : array-like
        Expected counts per bin.

    Returns
    -------
    float
        Cost function value.

    Notes
    -----
    The implementation makes the result asymptotically chi2-distributed,
    which helps to maximise the numerical accuracy for Minuit.
    """
    n, mu = np.atleast_1d(n, mu)
    return 2 * np.sum(n * (log_or_zero(n) - log_or_zero(mu)) + mu - n)


def _poisson_chi2_grad(n: NDArray, mu: NDArray, gmu: NDArray) -> NDArray:
    assert gmu.ndim == 2
    return 2 * np.sum((1.0 - n / mu) * gmu, axis=1)


def multinomial_chi2(n: ArrayLike, mu: ArrayLike) -> float:
    """
    Compute asymptotically chi2-distributed cost for multinomially-distributed data.

    See Baker & Cousins, NIM 221 (1984) 437-442.

    Parameters
    ----------
    n : array-like
        Observed counts.
    mu : array-like
        Expected counts. Must satisfy sum(mu) == sum(n).

    Returns
    -------
    float
        Cost function value.

    Notes
    -----
    The implementation makes the result asymptotically chi2-distributed,
    which helps to maximise the numerical accuracy for Minuit.
    """
    n, mu = np.atleast_1d(n, mu)
    return 2 * np.sum(n * (log_or_zero(n) - log_or_zero(mu)))


def _multinomial_chi2_grad(n: NDArray, mu: NDArray, gmu: NDArray) -> NDArray:
    assert gmu.ndim == 2
    return -2 * np.sum(n / mu * gmu, axis=1)


def template_chi2_jsc(n: ArrayLike, mu: ArrayLike, mu_var: ArrayLike) -> float:
    """
    Compute asymptotically chi2-distributed cost for a template fit.

    J.S. Conway, PHYSTAT 2011, https://doi.org/10.48550/arXiv.1103.0354

    Parameters
    ----------
    n : array-like
        Observed counts.
    mu : array-like
        Expected counts. This is the sum of the normalised templates scaled with
        the component yields. Must be positive everywhere.
    mu_var : array-like
        Expected variance of mu. Must be positive everywhere.

    Returns
    -------
    float
        Asymptotically chi-square-distributed test statistic.

    Notes
    -----
    The implementation deviates slightly from the paper by making the result
    asymptotically chi2-distributed, which helps to maximise the numerical
    accuracy for Minuit.
    """
    n, mu, mu_var = np.atleast_1d(n, mu, mu_var)

    beta_var = mu_var / mu**2

    # Eq. 15 from https://doi.org/10.48550/arXiv.2206.12346
    p = 0.5 - 0.5 * mu * beta_var
    beta = p + np.sqrt(p**2 + n * beta_var)

    return poisson_chi2(n, mu * beta) + np.sum((beta - 1) ** 2 / beta_var)


def template_chi2_da(n: ArrayLike, mu: ArrayLike, mu_var: ArrayLike) -> float:
    """
    Compute asymptotically chi2-distributed cost for a template fit.

    H.P. Dembinski, A. Abdelmotteleb, https://doi.org/10.48550/arXiv.2206.12346

    Parameters
    ----------
    n : array-like
        Observed counts.
    mu : array-like
        Expected counts. This is the sum of the normalised templates scaled
        with the component yields.
    mu_var : array-like
        Expected variance of mu. Must be positive everywhere.

    Returns
    -------
    float
        Asymptotically chi-square-distributed test statistic.
    """
    n, mu, mu_var = np.atleast_1d(n, mu, mu_var)
    k = mu**2 / mu_var
    # avoid divide by zero
    beta = (n + k) / (mu + k + _TINY_FLOAT)
    return poisson_chi2(n, mu * beta) + poisson_chi2(k, k * beta)


def template_nll_asy(n: ArrayLike, mu: ArrayLike, mu_var: ArrayLike) -> float:
    """
    Compute the marginalized negative log-likelihood for a template fit.

    This is the negative logarithm of equation 3.15 of the paper by
    C.A. Argüelles, A. Schneider, T. Yuan,
    https://doi.org/10.1007/JHEP06(2019)030.

    The authors use a Bayesian approach and integrate over the nuisance
    parameters. Like the other Barlow-Beeston-lite methods, this is an
    approximation. The resulting likelihood cannot be turned into an
    asymptotically chi-square distributed test statistic as detailed
    in Baker & Cousins, NIM 221 (1984) 437-442.

    Parameters
    ----------
    n : array-like
        Observed counts.
    mu : array-like
        Expected counts. This is the sum of the normalised templates scaled
        with the component yields.
    mu_var : array-like
        Expected variance of mu. Must be positive everywhere.

    Returns
    -------
    float
        Negative log-likelihood function value.
    """
    from scipy.special import loggamma as lg

    n, mu, mu_var = np.atleast_1d(n, mu, mu_var)

    alpha = mu**2 / mu_var + 1
    beta = mu / mu_var
    return -np.sum(
        alpha * np.log(beta)
        + lg(n + alpha)
        - (lg(n + 1) + (n + alpha) * np.log(1 + beta) + lg(alpha))
    )


# If numba is available, use it to accelerate computations in float32 and float64
# precision. Fall back to plain numpy for float128 which is not currently supported
# by numba.
try:
    from numba import njit
    from numba.extending import overload as nb_overload

    jit = njit(nogil=True, cache=True, error_model="numpy")

    @nb_overload(log_or_zero, inline="always")
    def _ol_log_or_zero(x):
        return log_or_zero  # pragma: no cover

    @nb_overload(_z_squared, inline="always")
    def _ol_z_squared(y, ye, ym):
        return _z_squared  # pragma: no cover

    _unbinned_nll_np = _unbinned_nll
    _unbinned_nll_nb = jit(_unbinned_nll_np)

    def _unbinned_nll(x):
        if x.dtype in (np.float32, np.float64):
            return _unbinned_nll_nb(x)
        # fallback to numpy for float128
        return _unbinned_nll_np(x)

    _multinomial_chi2_np = multinomial_chi2
    _multinomial_chi2_nb = jit(_multinomial_chi2_np)

    def multinomial_chi2(n: ArrayLike, mu: ArrayLike) -> float:  # noqa
        n, mu = np.atleast_1d(n, mu)
        if mu.dtype in (np.float32, np.float64):
            return _multinomial_chi2_nb(n, mu)
        # fallback to numpy for float128
        return _multinomial_chi2_np(n, mu)

    multinomial_chi2.__doc__ = _multinomial_chi2_np.__doc__

    _poisson_chi2_np = poisson_chi2
    _poisson_chi2_nb = jit(_poisson_chi2_np)

    def poisson_chi2(n: ArrayLike, mu: ArrayLike) -> float:  # noqa
        n, mu = np.atleast_1d(n, mu)
        if mu.dtype in (np.float32, np.float64):
            return _poisson_chi2_nb(n, mu)
        # fallback to numpy for float128
        return _poisson_chi2_np(n, mu)

    poisson_chi2.__doc__ = _poisson_chi2_np.__doc__

    _chi2_np = chi2
    _chi2_nb = jit(_chi2_np)

    def chi2(y: ArrayLike, ye: ArrayLike, ym: ArrayLike) -> float:  # noqa
        y, ye, ym = np.atleast_1d(y, ye, ym)
        if ym.dtype in (np.float32, np.float64):
            return _chi2_nb(y, ye, ym)
        # fallback to numpy for float128
        return _chi2_np(y, ye, ym)

    chi2.__doc__ = _chi2_np.__doc__

    _soft_l1_cost_np = _soft_l1_cost
    _soft_l1_cost_nb = jit(_soft_l1_cost_np)

    def _soft_l1_cost(y: NDArray, ye: NDArray, ym: NDArray) -> float:
        if ym.dtype in (np.float32, np.float64):
            return _soft_l1_cost_nb(y, ye, ym)
        # fallback to numpy for float128
        return _soft_l1_cost_np(y, ye, ym)

except ModuleNotFoundError:
    pass


class Cost(abc.ABC):
    """
    Base class for all cost functions.

    :meta private:
    """

    __slots__ = ("_parameters", "_verbose")

    _parameters: Dict[str, Optional[Tuple[float, float]]]
    _verbose: int

    @property
    def errordef(self):
        """
        For internal use.

        :meta private:
        """
        return self._errordef()

    def _errordef(self):
        return CHISQUARE

    @property
    def ndata(self):
        """
        Return number of points in least-squares fits or bins in a binned fit.

        Infinity is returned if the cost function is unbinned. This is used by Minuit to
        compute the reduced chi2, a goodness-of-fit estimate.
        """
        return self._ndata()

    @property
    def npar(self):
        """Return total number of model parameters."""
        return len(self._parameters)

    @abc.abstractmethod
    def _ndata(self):
        NotImplemented  # pragma: no cover

    @property
    def verbose(self):
        """
        Access verbosity level.

        Set this to 1 to print all function calls with input and output.
        """
        return self._verbose

    @verbose.setter
    def verbose(self, value: int):
        self._verbose = value

    def __init__(
        self, parameters: Dict[str, Optional[Tuple[float, float]]], verbose: int
    ):
        """For internal use."""
        self._parameters = parameters
        self._verbose = verbose

    def __add__(self, rhs):
        """
        Add two cost functions to form a combined cost function.

        Returns
        -------
        CostSum
        """
        return CostSum(self, rhs)

    def __radd__(self, lhs):
        """
        Add two cost functions to form a combined cost function.

        Returns
        -------
        CostSum
        """
        return CostSum(lhs, self)

    def __call__(self, *args: float) -> float:
        """
        Evaluate the cost function.

        If verbose >= 1, print arguments and result.

        Parameters
        ----------
        *args : float
            Parameter values.

        Returns
        -------
        float
        """
        r = self._value(args)
        if self.verbose >= 1:
            print(args, "->", r)
        return r

    def grad(self, *args: float) -> NDArray:
        """
        Compute gradient of the cost function.

        This requires that a model gradient is provided.

        Parameters
        ----------
        *args : float
            Parameter values.

        Returns
        -------
        ndarray of float
            The length of the array is equal to the length of args.
        """
        return self._grad(args)

    @property
    def has_grad(self) -> bool:
        """Return True if cost function can compute a gradient."""
        return self._has_grad()

    @abc.abstractmethod
    def _value(self, args: Sequence[float]) -> float: ...  # pragma: no cover

    @abc.abstractmethod
    def _grad(self, args: Sequence[float]) -> NDArray: ...  # pragma: no cover

    @abc.abstractmethod
    def _has_grad(self) -> bool: ...  # pragma: no cover


class Constant(Cost):
    """
    Cost function that represents a constant.

    If your cost function produces results that are far away from O(1), adding a
    constant that brings the value closer to zero may improve the numerical stability.
    """

    __slots__ = "value"

    def __init__(self, value: float):
        """Initialize constant with a value."""
        self.value = value
        super().__init__({}, False)

    def _ndata(self):
        return 0

    def _value(self, args: Sequence[float]) -> float:
        return self.value

    def _grad(self, args: Sequence[float]) -> NDArray:
        return np.zeros(0)

    @staticmethod
    def _has_grad():
        return True


class CostSum(Cost, ABCSequence):
    """
    Sum of cost functions.

    Users do not need to create objects of this class themselves. They should just add
    cost functions, for example::

        nll = UnbinnedNLL(...)
        lsq = LeastSquares(...)
        ncs = NormalConstraint(...)
        csum = nll + lsq + ncs

    CostSum is used to combine data from different experiments or to combine normal cost
    functions with penalty terms (see NormalConstraint).

    The parameters of CostSum are the union of all parameters of its constituents.

    Supports the sequence protocol to access the constituents.

    Warnings
    --------
    CostSum does not support cost functions that accept a parameter array, because the
    function signature does not allow one to determine how many parameters are accepted
    by the function and which parameters overlap between different cost functions.
    """

    __slots__ = "_items", "_maps"

    def __init__(self, *items: Union[Cost, float]):
        """
        Initialize with cost functions.

        Parameters
        ----------
        *items : Cost
            Cost functions. May also be other CostSum functions.
        """
        self._items: List[Cost] = []
        for item in items:
            if isinstance(item, CostSum):
                self._items += item._items
            elif isinstance(item, (int, float)):
                if item != 0:
                    self._items.append(Constant(item))
            else:
                self._items.append(item)
        signatures, self._maps = merge_signatures(self._items, annotations=True)
        super().__init__(signatures, max(c.verbose for c in self._items))

    def _split(self, args: Sequence[float]):
        for component, cmap in zip(self._items, self._maps):
            component_args = tuple(args[i] for i in cmap)
            yield component, component_args

    def _value(self, args: Sequence[float]) -> float:
        r = 0.0
        for component, component_args in self._split(args):
            r += component._value(component_args) / component.errordef
        return r

    def _grad(self, args: Sequence[float]) -> NDArray:
        r = np.zeros(self.npar)
        for component, indices in zip(self._items, self._maps):
            component_args = tuple(args[i] for i in indices)
            r[indices] += component._grad(component_args) / component.errordef
        return r

    def _has_grad(self) -> bool:
        return all(component.has_grad for component in self._items)

    def _ndata(self):
        return sum(c.ndata for c in self._items)

    def __len__(self):
        """Return number of constituent cost functions."""
        return self._items.__len__()

    def __getitem__(self, key):
        """Get constituent cost function by index."""
        return self._items.__getitem__(key)

    def visualize(
        self,
        args: Sequence[float],
        component_kwargs: Optional[Dict[int, Dict[str, Any]]] = None,
    ):
        """
        Visualize data and model agreement (requires matplotlib).

        The visualization is drawn with matplotlib.pyplot into the current figure.
        Subplots are created to visualize each part of the cost function, the figure
        height is increased accordingly. Parts without a visualize method are silently
        ignored.

        Parameters
        ----------
        args : array-like
            Parameter values.
        component_kwargs : dict of dicts, optional
            Dict that maps the index of a component to a dict of keyword arguments,
            which are passed to the visualize method of that component.
        """
        from matplotlib import pyplot as plt

        n = sum(hasattr(comp, "visualize") for comp in self)

        fig = plt.gcf()
        fig.set_figwidth(n * fig.get_figwidth() / 1.5)
        _, ax = plt.subplots(1, n, num=fig.number)

        if component_kwargs is None:
            component_kwargs = {}

        i = 0
        for k, (comp, cargs) in enumerate(self._split(args)):
            if not hasattr(comp, "visualize"):
                continue
            kwargs = component_kwargs.get(k, {})
            plt.sca(ax[i])
            comp.visualize(cargs, **kwargs)
            i += 1


class MaskedCost(Cost):
    """
    Base class for cost functions that support data masking.

    :meta private:
    """

    __slots__ = "_data", "_mask", "_masked"

    _mask: Optional[NDArray]

    def __init__(
        self,
        parameters: Dict[str, Optional[Tuple[float, float]]],
        data: NDArray,
        verbose: int,
    ):
        """For internal use."""
        self._data = data
        self._mask = None
        self._update_cache()
        Cost.__init__(self, parameters, verbose)

    @property
    def mask(self):
        """
        Boolean array, array of indices, or None.

        If not None, only values selected by the mask are considered. The mask acts on
        the first dimension of a value array, i.e. values[mask]. Default is None.
        """
        return self._mask

    @mask.setter
    def mask(self, mask: Optional[ArrayLike]):
        self._mask = None if mask is None else np.asarray(mask)
        self._update_cache()

    @property
    def data(self):
        """Return data samples."""
        return self._data

    @data.setter
    def data(self, value: ArrayLike):
        self._data[...] = value
        self._update_cache()

    def _update_cache(self):
        self._masked = self._data[_replace_none(self._mask, ...)]


class MaskedCostWithPulls(MaskedCost):
    """
    Base class for cost functions with pulls.

    :meta private:
    """

    def pulls(self, args: Sequence[float]) -> NDArray:
        """
        Return studentized residuals (aka pulls).

        Parameters
        ----------
        args : sequence of float
            Parameter values.

        Returns
        -------
        array
            Array of pull values. If the cost function is masked, the array contains NaN
            values where the mask value is False.

        Notes
        -----
        Pulls allow one to estimate how well a model fits the data. A pull is a value
        computed for each data point. It is given by (observed - predicted) /
        standard-deviation. If the model is correct, the expectation value of each pull
        is zero and its variance is one in the asymptotic limit of infinite samples.
        Under these conditions, the chi-square statistic, computed as the sum of
        squared pulls, has a known probability distribution if the model is correct. It
        therefore serves as a goodness-of-fit statistic.

        Beware: the sum of squared pulls is in general not identical to the value
        returned by the cost function, even if the cost function returns a chi-square
        distributed test statistic. The cost function is computed in a slightly
        different way that makes the return value approach the asymptotic chi-square
        distribution faster than a test statistic based on the sum of squared pulls. In
        summary, only use pulls for plots. Compute the chi-square test statistic
        directly from the cost function.
        """
        return self._pulls(args)

    def _ndata(self):
        return np.prod(self._masked.shape[: self._ndim])

    @abc.abstractmethod
    def _pulls(self, args: Sequence[float]) -> NDArray: ...  # pragma: no cover


class UnbinnedCost(MaskedCost):
    """
    Base class for unbinned cost functions.

    :meta private:
    """

    __slots__ = "_model", "_model_grad", "_log"

    def __init__(
        self,
        data,
        model: Model,
        verbose: int,
        log: bool,
        grad: Optional[ModelGradient],
        name: Optional[Sequence[str]],
    ):
        """For internal use."""
        self._model = model
        self._log = log
        self._model_grad = grad
        super().__init__(_model_parameters(model, name), _norm(data), verbose)

    @abc.abstractproperty
    def pdf(self):
        """Get probability density model."""
        ...  # pragma: no cover

    @abc.abstractproperty
    def scaled_pdf(self):
        """Get number density model."""
        ...  # pragma: no cover

    def _ndata(self):
        # unbinned likelihoods have infinite degrees of freedom
        return np.inf

    def _npoints(self):
        # cannot use len(self._masked) because multi-dimensional data has format
        # (K, N) with K dimensions and N points
        return self._masked.shape[-1]

    @deprecated_parameter(bins="nbins")
    def visualize(
        self,
        args: Sequence[float],
        model_points: Union[int, Sequence[float]] = 0,
        bins: int = 50,
    ):
        """
        Visualize data and model agreement (requires matplotlib).

        The visualization is drawn with matplotlib.pyplot into the current axes.

        Parameters
        ----------
        args : array-like
            Parameter values.
        model_points : int or array-like, optional
            How many points to use to draw the model. Default is 0; in this case
            a smart sampling algorithm selects the number of points. If array-like,
            it is interpreted as the point locations.
        bins : int, optional
            Number of bins. Default is 50.
        """
        from matplotlib import pyplot as plt

        x = np.sort(self.data)

        if x.ndim > 1:
            raise ValueError("visualize is not implemented for multi-dimensional data")

        # this implementation only works with a histogram with linear spacing

        if isinstance(model_points, Iterable):
            xm = np.array(model_points)
            ym = self.scaled_pdf(xm, *args)
        elif model_points > 0:
            if _detect_log_spacing(x):
                xm = np.geomspace(x[0], x[-1], model_points)
            else:
                xm = np.linspace(x[0], x[-1], model_points)
            ym = self.scaled_pdf(xm, *args)
        else:
            xm, ym = _smart_sampling(lambda x: self.scaled_pdf(x, *args), x[0], x[-1])

        # use xm for range, which may be narrower or wider than x range
        n, xe = np.histogram(x, bins=bins, range=(xm[0], xm[-1]))
        cx = 0.5 * (xe[1:] + xe[:-1])
        dx = xe[1] - xe[0]

        plt.errorbar(cx, n, n**0.5, fmt="ok")
        plt.fill_between(xm, 0, ym * dx, fc="C0")

    def fisher_information(self, *args: float) -> NDArray:
        """
        Estimate Fisher information for model and sample.

        The estimated Fisher information is only meaningful if the arguments provided
        are estimates of the true values.

        Parameters
        ----------
        *args: float
            Estimates of model parameters.
        """
        g = self._pointwise_score(args)
        return np.einsum("ji,ki->jk", g, g)

    def covariance(self, *args: float) -> NDArray:
        """
        Estimate covariance of the parameters with the sandwich estimator.

        This requires that the model gradient is provided, and that the arguments are
        the maximum-likelihood estimates. The sandwich estimator is only asymptotically
        correct.

        Parameters
        ----------
        *args : float
            Maximum-likelihood estimates of the parameter values.

        Returns
        -------
        ndarray of float
            The array has shape (K, K) for K arguments.
        """
        return np.linalg.inv(self.fisher_information(*args))

    @abc.abstractmethod
    def _pointwise_score(
        self, args: Sequence[float]
    ) -> NDArray: ...  # pragma: no cover

    def _has_grad(self) -> bool:
        return self._model_grad is not None


class UnbinnedNLL(UnbinnedCost):
    """
    Unbinned negative log-likelihood.

    Use this if only the shape of the fitted PDF is of interest and the original
    unbinned data is available. The data can be one- or multi-dimensional.
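
    Examples
    --------
    A minimal sketch (this assumes ``scipy`` is installed; the sample and the starting
    values are made up for illustration)::

        import numpy as np
        from scipy.stats import norm
        from iminuit import Minuit
        from iminuit.cost import UnbinnedNLL

        rng = np.random.default_rng(1)
        sample = rng.normal(0.0, 1.0, size=1000)

        def pdf(x, mu, sigma):
            return norm.pdf(x, mu, sigma)

        cost = UnbinnedNLL(sample, pdf)
        m = Minuit(cost, mu=0.1, sigma=0.9)
        m.migrad()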
    """

    __slots__ = ()

    @property
    def pdf(self):
        """Get probability density model."""
        if self._log:
            return lambda *args: np.exp(self._model(*args))
        return self._model

    @property
    def scaled_pdf(self):
        """Get number density model."""
        scale = np.prod(self.data.shape)
        if self._log:
            return lambda *args: scale * np.exp(self._model(*args))
        return lambda *args: scale * self._model(*args)

    def __init__(
        self,
        data: ArrayLike,
        pdf: Model,
        *,
        verbose: int = 0,
        log: bool = False,
        grad: Optional[ModelGradient] = None,
        name: Optional[Sequence[str]] = None,
    ):
        """
        Initialize UnbinnedNLL with data and model.

        Parameters
        ----------
        data : array-like
            Sample of observations. If the observations are multidimensional, data must
            have the shape (D, N), where D is the number of dimensions and N the number
            of data points.
        pdf : callable
            Probability density function of the form f(data, par0, [par1, ...]), where
            data is the data sample and par0, ... are model parameters. If the data are
            multivariate, data passed to f has shape (D, N), where D is the number of
            dimensions and N the number of data points. Must return an array with the
            shape (N,).
        verbose : int, optional
            Verbosity level. 0: no output (default). 1: print current args and
            negative log-likelihood value.
        log : bool, optional
            Distributions of the exponential family (normal, exponential, poisson, ...)
            allow one to compute the logarithm of the pdf directly, which is more
            accurate and efficient than numerically computing ``log(pdf)``. Set this
            to True, if the model returns the logpdf instead of the pdf.
            Default is False.
        grad : callable or None, optional
            Optionally pass the gradient of the pdf. Has the same calling signature as
            the pdf, but must return an array with the shape (K, N), where N is the
            number of data points and K is the number of parameters. If `log` is True,
            the function must return the gradient of the logpdf instead of the pdf. The
            gradient can be used by Minuit to improve or speed up convergence and to
            compute the sandwich estimator for the variance of the parameter estimates.
            Default is None.
        name : sequence of str or None, optional
            Optional names for each parameter of the model (in order). Must have the
            same length as there are model parameters. Default is None.
        """
        super().__init__(data, pdf, verbose, log, grad, name)

    def _value(self, args: Sequence[float]) -> float:
        f = self._eval_model(args)
        if self._log:
            return -2.0 * np.sum(f)
        return 2.0 * _unbinned_nll(f)

    def _grad(self, args: Sequence[float]) -> NDArray:
        g = self._pointwise_score(args)
        return -2.0 * np.sum(g, axis=1)

    def _pointwise_score(self, args: Sequence[float]) -> NDArray:
        g = self._eval_model_grad(args)
        if self._log:
            return g
        f = self._eval_model(args)
        return g / f

    def _eval_model(self, args: Sequence[float]) -> float:
        data = self._masked
        return _normalize_output(self._model(data, *args), "model", self._npoints())

    def _eval_model_grad(self, args: Sequence[float]) -> NDArray:
        if self._model_grad is None:
            raise ValueError("no gradient available")  # pragma: no cover
        data = self._masked
        return _normalize_output(
            self._model_grad(data, *args), "model gradient", self.npar, self._npoints()
        )


class ExtendedUnbinnedNLL(UnbinnedCost):
    """
    Unbinned extended negative log-likelihood.

    Use this if shape and normalization of the fitted PDF are of interest and the
    original unbinned data is available. The data can be one- or multi-dimensional.
    """

    __slots__ = ()

    @property
    def pdf(self):
        """Get probability density model."""
        if self._log:

            def fn(*args):
                n, x = self._model(*args)
                return np.exp(x) / n

        else:

            def fn(*args):
                n, x = self._model(*args)
                return x / n

        return fn

    @property
    def scaled_pdf(self):
        """Get density model."""
        if self._log:
            return lambda *args: np.exp(self._model(*args)[1])
        return lambda *args: self._model(*args)[1]

    def __init__(
        self,
        data: ArrayLike,
        scaled_pdf: Model,
        *,
        verbose: int = 0,
        log: bool = False,
        grad: Optional[ModelGradient] = None,
        name: Optional[Sequence[str]] = None,
    ):
        """
        Initialize cost function with data and model.

        Parameters
        ----------
        data : array-like
            Sample of observations. If the observations are multidimensional, data must
            have the shape (D, N), where D is the number of dimensions and N the number
            of data points.
        scaled_pdf : callable
            Density function of the form f(data, par0, [par1, ...]), where data is the
            sample and par0, ... are model parameters. Must return a tuple (<integral
            over f in data window>, <f evaluated at data points>). The first value is
            the density integrated over the data window, the interval that we consider
            for the fit. For example, if the data are exponentially distributed, but we
            fit only the interval (0, 5), then the first value is the density integrated
            from 0 to 5. If the data are multivariate, data passed to f has shape (D,
            N), where D is the number of dimensions and N the number of data points.
            See the example below.
        verbose : int, optional
            Verbosity level. 0: no output (default). 1: print current args and
            negative log-likelihood value.
        log : bool, optional
            Distributions of the exponential family (normal, exponential, poisson, ...)
            allow one to compute the logarithm of the pdf directly, which is more
            accurate and efficient than effectively doing ``log(exp(logpdf))``. Set this
            to True, if the model returns the logarithm of the density as the second
            argument instead of the density. Default is False.
        grad : callable or None, optional
            Optionally pass the gradient of the density function. Has the same calling
            signature as the density function, but must return two arrays. The first
            array has shape (K,) where K are the number of parameters, while the second
            has shape (K, N), where N is the number of data points. The first array is
            the gradient of the integrated density. The second array is the gradient of
            the density itself. If `log` is True, the second array must be the gradient
            of the log-density instead. The gradient can be used by Minuit to improve or
            speed up convergence and to compute the sandwich estimator for the variance
            of the parameter estimates. Default is None.
        name : sequence of str or None, optional
            Optional names for each parameter of the model (in order). Must have the
            same length as there are model parameters. Default is None.
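
        Examples
        --------
        A sketch of a density with a free normalisation: an exponential distribution on
        ``(0, inf)`` scaled by a yield ``n`` (the names and the data sample ``data`` are
        placeholders for illustration)::

            import numpy as np
            from iminuit.cost import ExtendedUnbinnedNLL

            def scaled_density(x, n, slope):
                # first value: integral of the density over the data window (here n),
                # second value: density evaluated at the data points
                return n, n * slope * np.exp(-slope * x)

            cost = ExtendedUnbinnedNLL(data, scaled_density)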
        """
        super().__init__(data, scaled_pdf, verbose, log, grad, name)

    def _value(self, args: Sequence[float]) -> float:
        fint, f = self._eval_model(args)
        if self._log:
            return 2 * (fint - np.sum(f))
        return 2 * (fint + _unbinned_nll(f))

    def _grad(self, args: Sequence[float]) -> NDArray:
        g = self._pointwise_score(args)
        return -2 * np.sum(g, axis=1)

    def _pointwise_score(self, args: Sequence[float]) -> NDArray:
        gint, g = self._eval_model_grad(args)
        m = self._npoints()
        if self._log:
            return g - (gint / m)[:, np.newaxis]
        _, f = self._eval_model(args)
        return g / f - (gint / m)[:, np.newaxis]

    def _eval_model(self, args: Sequence[float]) -> Tuple[float, float]:
        data = self._masked
        fint, f = self._model(data, *args)
        f = _normalize_output(f, "model", self._npoints(), msg="in second position")
        return fint, f

    def _eval_model_grad(self, args: Sequence[float]) -> Tuple[NDArray, NDArray]:
        if self._model_grad is None:
            raise ValueError("no gradient available")  # pragma: no cover
        data = self._masked
        gint, g = self._model_grad(data, *args)
        gint = _normalize_output(
            gint, "model gradient", self.npar, msg="in first position"
        )
        g = _normalize_output(
            g, "model gradient", self.npar, self._npoints(), msg="in second position"
        )
        return gint, g


class BinnedCost(MaskedCostWithPulls):
    """
    Base class for binned cost functions to support histograms filled with weights.

    Histograms filled with weights are supported by applying the Bohm-Zech transform.

    The Bohm-Zech approach was further generalized to handle sums of weights which are
    negative. See Baker & Cousins, NIM 221 (1984) 437-442; Bohm and Zech, NIMA 748
    (2014) 1-6; H. Dembinski, M. Schmelling, R. Waldi, Nucl.Instrum.Meth.A 940 (2019)
    135-141.

    Bohm and Zech use the scaled Poisson distribution (SPD) as an approximate way to
    handle sums of weights instead of Poisson counts. This approach also works for
    multinomial distributions. The idea of Bohm and Zech is to use the likelihood
    for Poisson-distributed data also for weighted data. They show that one can match
    the first and second moment of the compound Poisson distribution for weighted data
    with a single Poisson distribution, using a scaling factor s that is multiplied with
    the predicted expectation and the observation.

    This scaling factor is computed as s = sum(wi) / sum(wi**2), where wi are the
    weights in the current bin. Instead of the Baker & Cousins transformed
    log-likelihood l(n; mu) for Poisson-distributed data, where n is the observed count
    and mu is the expectation, we now compute l(sum(w) * s; mu * s), which can be
    further simplified:

    l(w * s, mu * s) = 2 * [(w * s) * (log(w * s) - log(mu * s)) + s * mu - s * w]
                     = 2 * s * [w * (log(w) - log(mu)) + mu - w]
                     = s * l(w, mu)

    For multinomially-distributed data and s = 1, sum(w - mu) = 0, which is why these
    terms can be omitted in the standard calculation without weights, but in the case of
    weighted counts, sum(s * (w - mu)) != 0 and the terms must be kept.

    The original formulas from Bohm and Zech are only applicable if w >= 0 (with the
    extra condition that w * log(w) evaluates to 0 for w = 0). One can generalize the
    formula to w < 0, which is relevant in practice for example in fits of sweighted
    samples, by computing s = abs(sum(wi)) / sum(wi ** 2) and replacing w * log(w) with
    0 for w <= 0.
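
    For illustration, a sketch of the per-bin computation with toy numbers::

        import numpy as np

        w = np.array([1.2, -0.3, 2.0])   # weights collected in one bin
        s = abs(w.sum()) / np.sum(w**2)  # generalized Bohm-Zech scale factor
        n_eff = s * w.sum()              # scaled observation entering l(n_eff, s * mu)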

    This works because this extension has the right gradient. The gradient should be
    equal to that of the quadratic function s * (w - mu)**2 / mu', where mu' = mu but is
    kept fixed during the gradient computation; see H. Dembinski, M. Schmelling, R.
    Waldi. The minimum of this quadratic function yields an unbiased estimate of mu,
    even if some w are negative. Since the quadratic function and the original function
    have the same gradient, the minima of both functions are the same, and the original
    function also yields an unbiased estimate.

    The gradient is not affected by the particular choice of how to handle w * log(w)
    with w < 0, since this term drops out in the computation of the gradient. Other
    choices are possible. Our goal was to select an option which keeps the function
    minimum approximately chi-square distributed, although that property tends to
    dissolve when negative weights are involved. The minimum can even become negative.

    :meta private:
    """

    __slots__ = "_xe", "_ndim", "_bohm_zech_n", "_bohm_zech_s"

    _xe: Union[NDArray, Tuple[NDArray, ...]]
    _ndim: int
    _bohm_zech_n: NDArray
    _bohm_zech_s: Optional[NDArray]

    n = MaskedCost.data

    @property
    def xe(self):
        """Access bin edges."""
        return self._xe

    def __init__(
        self,
        parameters: Dict[str, Optional[Tuple[float, float]]],
        n: ArrayLike,
        xe: Union[ArrayLike, Sequence[ArrayLike]],
        verbose: int,
    ):
        """For internal use."""
        if not isinstance(xe, Iterable):
            raise ValueError("xe must be iterable")

        shape = _shape_from_xe(xe)
        self._ndim = len(shape)
        if self._ndim == 1:
            self._xe = _norm(cast(ArrayLike, xe))
        else:
            self._xe = tuple(_norm(xei) for xei in xe)

        n = _norm(n)

        is_weighted = n.ndim > self._ndim and n.shape[-1] == 2

        if n.ndim != (self._ndim + int(is_weighted)):
            raise ValueError("n must either have same dimension as xe or one extra")

        xei: NDArray
        for i, xei in enumerate([self._xe] if self._ndim == 1 else self._xe):
            if len(xei) != n.shape[i] + 1:
                raise ValueError(
                    f"n and xe have incompatible shapes along dimension {i}, "
                    "xe must be longer by one element along each dimension"
                )

        # _bohm_zech_s will be set properly when init of base class
        # is called, which in turn calls our _update_cache() override
        self._bohm_zech_s = np.zeros(0) if is_weighted else None
        super().__init__(parameters, n, verbose)

    def prediction(
        self, args: Sequence[float]
    ) -> Union[NDArray, Tuple[NDArray, NDArray]]:
        """
        Return the bin-wise expectation for the fitted model.

        Parameters
        ----------
        args : array-like
            Parameter values.

        Returns
        -------
        NDArray
            Model prediction for each bin. The expectation is always returned for all
            bins, even if some bins are temporarily masked.
        """
        return self._pred(args)

    def visualize(self, args: Sequence[float]) -> None:
        """
        Visualize data and model agreement (requires matplotlib).

        The visualization is drawn with matplotlib.pyplot into the current axes.

        Parameters
        ----------
        args : sequence of float
            Parameter values.

        Notes
        -----
        The automatically provided visualization for multi-dimensional data sets is
        often not very pretty, but it still helps to judge whether the fit is
        reasonable. Since there is no obvious way to draw higher-dimensional data with
        error bars in comparison to a model, the visualization shows all data bins as a
        single sequence.
        """
        return self._visualize(args)

    def _visualize(self, args: Sequence[float]) -> None:
        from matplotlib import pyplot as plt

        n, ne = self._n_err()
        mu = self.prediction(args)
        assert not isinstance(mu, tuple)

        if self._ndim > 1:
            # flatten higher-dimensional data
            n = n.reshape(-1)
            ne = ne.reshape(-1)
            mu = mu.reshape(-1)
            # just use bin numbers instead of original values
            xe = np.arange(len(n) + 1) - 0.5
            cx = np.arange(len(n)).astype(float)
        else:
            xe = self.xe
            cx = 0.5 * (xe[1:] + xe[:-1])
        plt.errorbar(cx, n, ne, fmt="ok")
        plt.stairs(mu, xe, fill=True, color="C0")

    @abc.abstractmethod
    def _pred(
        self, args: Sequence[float]
    ) -> Union[NDArray, Tuple[NDArray, NDArray]]: ...  # pragma: no cover

    def _n_err(self) -> Tuple[NDArray, NDArray]:
        d = self.data
        if self._bohm_zech_s is None:
            n = d.copy()
            err = d**0.5
        else:
            n = d[..., 0].copy()
            err = d[..., 1] ** 0.5
        # mask values where the error is zero and apply the user mask, if any
        ma = err == 0
        if self.mask is not None:
            ma |= ~self.mask
        n[ma] = np.nan
        err[ma] = np.nan
        return n, err

    def _pulls(self, args: Sequence[float]) -> NDArray:
        mu = self.prediction(args)
        n, ne = self._n_err()
        return (n - mu) / ne

    def _update_cache(self):
        super()._update_cache()
        n = self._masked
        if self._bohm_zech_s is not None:
            val = n[..., 0]
            var = n[..., 1]
            s = np.zeros_like(val)
            ma = var > 0
            s[ma] = np.abs(val[ma]) / var[ma]
            # Assign the median of s from bins with entries to bins that have zero
            # entries. This is arbitrary, but still better than other arbitrary choices.
            s[~ma] = np.median(s[ma])
            self._bohm_zech_s = s
            self._bohm_zech_n = val * s
        else:
            self._bohm_zech_n = n

    def _transformed(self, val: NDArray) -> Tuple[NDArray, NDArray]:
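        # Apply the mask to the model prediction and scale it with the Bohm-Zech
        # factor, returning it together with the (already scaled) effective counts.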
        s = self._bohm_zech_s
        ma = self.mask
        if ma is not None:
            val = val[ma]
        n = self._bohm_zech_n
        if s is None:
            return n, val
        return n, val * s

    def _transformed2(
        self, val: NDArray, var: NDArray
    ) -> Tuple[NDArray, NDArray, NDArray]:
        s = self._bohm_zech_s
        ma = self.mask
        if ma is not None:
            val = val[ma]
            var = var[ma]
        n = self._bohm_zech_n
        if s is None:
            return n, val, var
        return n, val * s, var * s**2

    def _counts(self):
        if self._bohm_zech_s is None:
            return self._masked
        return self._masked[..., 0]


class BinnedCostWithModel(BinnedCost):
    """
    Base class for binned cost functions with parametric model.

    :meta private:
    """

    __slots__ = (
        "_xe_shape",
        "_model",
        "_model_xe",
        "_model_xm",
        "_model_dx",
        "_model_len",
        "_model_grad",
        "_pred_impl",
    )

    _model_xe: np.ndarray
    _xe_shape: Union[Tuple[int], Tuple[int, ...]]

    def __init__(self, n, xe, model, verbose, grad, use_pdf, name):
        """For internal use."""
        self._model = model
        self._model_grad = grad

        if use_pdf and grad:
            raise ValueError("keywords use_pdf and grad cannot be used together")

        if use_pdf == "approximate":
            self._pred_impl = self._pred_approximate
        elif use_pdf == "numerical":
            self._pred_impl = self._pred_numerical
        elif use_pdf == "":
            self._pred_impl = self._pred_cdf
        else:
            msg = (
                f"use_pdf={use_pdf} is not understood, "
                "allowed values are '', 'approximate', or 'numerical'"
            )
            raise ValueError(msg)

        super().__init__(_model_parameters(model, name), n, xe, verbose)

        if self._ndim == 1:
            self._xe_shape = (len(self.xe),)
            self._model_xe = _norm(self.xe)
            if use_pdf:
                dx = np.diff(self._model_xe)
                self._model_dx = dx
                self._model_xm = self._model_xe[:-1] + 0.5 * dx
        else:
            self._xe_shape = tuple(len(xei) for xei in self.xe)
            self._model_xe = np.vstack(
                [x.flatten() for x in np.meshgrid(*self.xe, indexing="ij")]
            )
            if use_pdf == "approximate":
                dx = [np.diff(xe) for xe in self.xe]
                xm = [xei[:-1] + 0.5 * dxi for (xei, dxi) in zip(self.xe, dx)]
                xm = np.meshgrid(*xm, indexing="ij")
                dx = np.meshgrid(*dx, indexing="ij")
                self._model_xm = np.array(xm)
                self._model_dx = np.prod(dx, axis=0)
            elif use_pdf == "numerical":
                raise ValueError(
                    'use_pdf="numerical" is not supported for '
                    "multidimensional histograms"
                )

        self._model_len = np.prod(self._xe_shape)

    def _pred(self, args: Sequence[float]) -> NDArray:
        return self._pred_impl(args)

    def _pred_cdf(self, args: Sequence[float]) -> NDArray:
        d = self._model(self._model_xe, *args)
        d = _normalize_output(d, "model", self._model_len)
        if self._ndim > 1:
            d = d.reshape(self._xe_shape)
        for i in range(self._ndim):
            d = np.diff(d, axis=i)
        # differences can come out negative due to round-off error in subtraction,
        # we set negative values to zero
        d[d < 0] = 0
        return d

    def _pred_approximate(self, args: Sequence[float]) -> NDArray:
        y = self._model(self._model_xm, *args)
        return y * self._model_dx

    def _pred_numerical(self, args: Sequence[float]) -> NDArray:
        from scipy.integrate import quad

        assert self._ndim == 1

        d = np.empty(self._model_len - 1)
        for i in range(self._model_len - 1):
            a = self._model_xe[i]
            b = self._model_xe[i + 1]
            d[i] = quad(lambda x: self._model(x, *args), a, b)[0]
        return d

    def _pred_grad(self, args: Sequence[float]) -> NDArray:
        d = self._model_grad(self._model_xe, *args)
        d = _normalize_output(d, "model gradient", self.npar, self._model_len)
        if self._ndim > 1:
            d = d.reshape((self.npar, *self._xe_shape))
        for i in range(1, self._ndim + 1):
            d = np.diff(d, axis=i)
        return d

    def _has_grad(self) -> bool:
        return self._model_grad is not None


class Template(BinnedCost):
    """
    Binned cost function for a template fit with uncertainties on the template.

    This cost function is for a mixture of components. Use this if the sample
    originates from two or more components and you are interested in estimating the
    yield that
    originates from one or more components. In high-energy physics, one component is
    often a peaking signal over a smooth background component. A component can be
    described by a parametric model or a template.

    A parametric model is accepted in the form of a scaled cumulative density function,
    while a template is a non-parametric shape estimate obtained by histogramming a
    Monte-Carlo simulation. Even if the Monte-Carlo simulation is asymptotically
    correct, estimating the shape from a finite simulation sample introduces some
    uncertainty. This cost function takes that additional uncertainty into account.

    There are several ways to fit templates and take the sampling uncertainty into
    account. Barlow and Beeston [1]_ found an exact likelihood for this problem, with
    one nuisance parameter per component per bin. Solving this likelihood is somewhat
    challenging though. The Barlow-Beeston likelihood also does not handle the
    additional uncertainty in weighted templates unless the weights per bin are all
    equal.

    Other works [2]_ [3]_ [4]_ describe likelihoods that use only one nuisance parameter
    per bin, which is an approximation. Some marginalize over the nuisance parameters
    with some prior, while others profile over the nuisance parameter. This class
    implements several of these methods. The default method is the one which performs
    best under most conditions, according to current knowledge. The default may change
    if this assessment changes.

    The cost function returns an asymptotically chi-square distributed test statistic,
    except for the method "asy", where it is the negative logarithm of the marginalized
    likelihood instead. The standard transform [5]_ which we use to convert likelihoods
    into test statistics only works for (profiled) likelihoods, not for likelihoods
    marginalized over a prior.

    All methods implemented here have been generalized to work with both weighted data
    and weighted templates, under the assumption that the weights are independent of the
    data. This is not the case for sWeights, and the uncertainties for results obtained
    with sWeights will only be approximately correct [6]_. The methods have been further
    generalized to allow fitting a mixture of parametric models and templates.

    .. [1] Barlow and Beeston, Comput.Phys.Commun. 77 (1993) 219-228
    .. [2] Conway, PHYSTAT 2011 proceeding, https://doi.org/10.48550/arXiv.1103.0354
    .. [3] Argüelles, Schneider, Yuan, JHEP 06 (2019) 030
    .. [4] Dembinski and Abdelmotteleb, https://doi.org/10.48550/arXiv.2206.12346
    .. [5] Baker and Cousins, NIM 221 (1984) 437-442
    .. [6] Langenbruch, Eur.Phys.J.C 82 (2022) 5, 393
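
    Examples
    --------
    A minimal sketch with two one-dimensional templates; the numbers are toy values
    chosen only for illustration::

        import numpy as np
        from iminuit import Minuit
        from iminuit.cost import Template

        xe = np.array([0.0, 1.0, 2.0, 3.0])  # bin edges of the data histogram
        n = np.array([105.0, 196.0, 98.0])   # observed counts
        sig = np.array([10.0, 80.0, 10.0])   # signal template counts
        bkg = np.array([50.0, 40.0, 30.0])   # background template counts

        cost = Template(n, xe, (sig, bkg), method="da")
        m = Minuit(cost, 100.0, 200.0)       # starting values for the two yields
        m.migrad()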
    """

    __slots__ = "_model_data", "_model_xe", "_xe_shape", "_impl", "_model_len"

    _model_data: List[
        Union[
            Tuple[NDArray, NDArray],
            Tuple[Model, float],
        ]
    ]
    _model_xe: np.ndarray
    _xe_shape: Union[Tuple[int], Tuple[int, ...]]

    def __init__(
        self,
        n: ArrayLike,
        xe: Union[ArrayLike, Sequence[ArrayLike]],
        model_or_template: Collection[Union[Model, ArrayLike]],
        *,
        name: Optional[Sequence[str]] = None,
        verbose: int = 0,
        method: str = "da",
    ):
        """
        Initialize cost function with data and model.

        Parameters
        ----------
        n : array-like
            Histogram counts. If this is an array with dimension D+1, where D is the
            number of histogram axes, then the last dimension must have two elements and
            is interpreted as pairs of sum of weights and sum of weights squared.
        xe : array-like or collection of array-like
            Bin edge locations; must have len(n) + 1 entries, one more than the number
            of bins. If the histogram has more than one axis, xe must be a collection
            of the bin edge locations along each axis.
        model_or_template : collection of array-like or callable
            Collection of models or arrays. An array represents the histogram counts of
            a template. The template histograms must use the same axes as the data
            histogram. If the counts are represented by an array with dimension D+1,
            where D is the number of histogram axes, then the last dimension must have
            two elements and is interpreted as pairs of sum of weights and sum of
            weights squared. Callables must return the model cdf evaluated at xe.
        name : sequence of str or None, optional
            Optional names for the yield of each template and the parameters of each
            model (in order). Must have the same length as there are templates and
            model parameters in model_or_template. Default is None.
        verbose : int, optional
            Verbosity level. 0: no output (default). 1: print current args and
            negative log-likelihood value.
        method : {"jsc", "asy", "da"}, optional
            Which method to use. "jsc": Conway's method [2]_. "asy": ASY method [3]_.
            "da": DA method [4]_. Default is "da", which to current knowledge offers the
            best overall performance. The default may change in the future, so please
            set this parameter explicitly in code that has to be stable. For all methods
            except the "asy" method, the minimum value is chi-square distributed.
        """
        M = len(model_or_template)
        if M < 1:
            raise ValueError("at least one template or model is required")

        shape = _shape_from_xe(xe)
        ndim = len(shape)

        npar = 0
        annotated: Dict[str, Optional[Tuple[float, float]]] = {}
        self._model_data = []
        for i, t in enumerate(model_or_template):
            if isinstance(t, Collection):
                tt = _norm(t)
                if tt.ndim > ndim:
                    # template is weighted
                    if tt.ndim != ndim + 1 or tt.shape[:-1] != shape:
                        raise ValueError("shapes of n and templates do not match")
                    t1 = tt[..., 0].copy()
                    t2 = tt[..., 1].copy()
                else:
                    if tt.ndim != ndim or tt.shape != shape:
                        raise ValueError("shapes of n and templates do not match")
                    t1 = tt.copy()
                    t2 = tt.copy()
                # normalize to unity
                f = 1 / np.sum(t1)
                t1 *= f
                t2 *= f**2
                self._model_data.append((t1, t2))
                annotated[f"x{i}"] = (0.0, np.inf)
            elif isinstance(t, Model):
                ann = _model_parameters(t, None)
                npar = len(ann)
                self._model_data.append((t, npar))
                for k in ann:
                    annotated[f"x{i}_{k}"] = ann[k]
            else:
                raise ValueError(
                    "model_or_template must be a collection of array-likes "
                    "and/or Model types"
                )

        if name is not None:
            if len(annotated) != len(name):
                raise ValueError(
                    "number of names must match number of templates and "
                    "model parameters"
                )
            annotated = {new: annotated[old] for (old, new) in zip(annotated, name)}

        known_methods = {
            "jsc": template_chi2_jsc,
            "asy": template_nll_asy,
            "hpd": template_chi2_da,
            "da": template_chi2_da,
        }
        try:
            self._impl = known_methods[method]
        except KeyError:
            raise ValueError(
                f"method {method} is not understood, "
                f"allowed values: {tuple(known_methods)}"
            )

        if method == "hpd":
            warnings.warn(
                "key 'hpd' is deprecated, please use 'da' instead",
                category=FutureWarning,
                stacklevel=2,
            )

        super().__init__(annotated, n, xe, verbose)

        if self._ndim == 1:
            self._xe_shape = (len(self.xe),)
            self._model_xe = _norm(self.xe)
        else:
            self._xe_shape = tuple(len(xei) for xei in self.xe)
            self._model_xe = np.vstack(
                [x.flatten() for x in np.meshgrid(*self.xe, indexing="ij")]
            )
        self._model_len = np.prod(self._xe_shape)

    def _pred(self, args: Sequence[float]) -> Tuple[NDArray, NDArray]:
        mu: NDArray = 0  # type:ignore
        mu_var: NDArray = 0  # type:ignore
        i = 0
        for t1, t2 in self._model_data:
            if isinstance(t1, np.ndarray) and isinstance(t2, np.ndarray):
                a = args[i]
                mu += a * t1
                mu_var += a**2 * t2
                i += 1
            elif isinstance(t1, Model) and isinstance(t2, int):
                d = t1(self._model_xe, *args[i : i + t2])
                d = _normalize_output(d, "model", self._model_len)
                if self._ndim > 1:
                    d = d.reshape(self._xe_shape)
                for j in range(self._ndim):
                    d = np.diff(d, axis=j)
                # differences can come out negative due to round-off error in
                # subtraction, we set negative values to zero
                d[d < 0] = 0
                mu += d
                mu_var += np.ones_like(mu) * 1e-300
                i += t2
            else:  # never arrive here
                assert False  # pragma: no cover
        return mu, mu_var

    def _value(self, args: Sequence[float]) -> float:
        mu, mu_var = self._pred(args)
        n, mu, mu_var = self._transformed2(mu, mu_var)
        ma = mu > 0
        return self._impl(n[ma].reshape(-1), mu[ma].reshape(-1), mu_var[ma].reshape(-1))

    def _grad(self, args: Sequence[float]) -> NDArray:
        raise NotImplementedError  # pragma: no cover

    def _has_grad(self) -> bool:
        return False

    def _errordef(self) -> float:
        return NEGATIVE_LOG_LIKELIHOOD if self._impl is template_nll_asy else CHISQUARE

    def prediction(self, args: Sequence[float]) -> Tuple[NDArray, NDArray]:
        """
        Return the fitted template and its standard deviation.

        This returns the prediction from the templates, i.e. the sum over the products
        of the template yields with the normalized templates. The standard deviation is
        returned as the second value; it is the estimated uncertainty of the fitted
        template alone. It is obtained via error propagation, taking the statistical
        uncertainty in the template into account, but regarding the yields as parameters
        without uncertainty.

        Parameters
        ----------
        args : array-like
            Parameter values.

        Returns
        -------
        y, yerr : NDArray, NDArray
            Template prediction and its standard deviation, based on the statistical
            uncertainty of the template only.
        """
        mu, mu_var = self._pred(args)
        return mu, np.sqrt(mu_var)

    def _visualize(self, args: Sequence[float]) -> None:
        from matplotlib import pyplot as plt

        n, ne = self._n_err()
        mu, mue = self.prediction(args)  # type: ignore

        # see implementation notes in BinnedCost.visualize
        if self._ndim > 1:
            n = n.reshape(-1)
            ne = ne.reshape(-1)
            mu = mu.reshape(-1)
            mue = mue.reshape(-1)
            xe = np.arange(len(n) + 1) - 0.5
            cx = np.arange(len(n)).astype(float)
        else:
            xe = self.xe
            cx = 0.5 * (xe[1:] + xe[:-1])

        plt.errorbar(cx, n, ne, fmt="ok")

        # need fill=True and fill=False so that bins with mue=0 show up
        for fill in (False, True):
            plt.stairs(mu + mue, xe, baseline=mu - mue, fill=fill, color="C0")

    def _pulls(self, args: Sequence[float]) -> NDArray:
        mu, mue = self.prediction(args)
        n, ne = self._n_err()
        return (n - mu) / (mue**2 + ne**2) ** 0.5


class BinnedNLL(BinnedCostWithModel):
    """
    Binned negative log-likelihood.

    Use this if only the shape of the fitted PDF is of interest and the data is binned.
    This cost function works with normal and weighted histograms. The histogram can be
    one- or multi-dimensional.

    The cost function has a minimum value that is asymptotically chi2-distributed. It is
    constructed from the log-likelihood assuming a multinomial distribution and
    using the saturated model as a reference, see :func:`multinomial_chi2` for details.

    When this class is used with weighted data, we use the Bohm-Zech transform for
    Poisson-distributed data and the :func:`poisson_chi2` cost function, because
    :func:`multinomial_chi2` yields biased results for weighted data. The
    reasoning for this choice is that :func:`multinomial_chi2` and :func:`poisson_chi2`
    yield the same result for a model which predicts probabilities and expected counts
    are computed by multiplying the probability with the total number of counts. Thus we
    can derive :func:`multinomial_chi2` as a special case of :func:`poisson_chi2` in
    case of unweighted data, but this mathematical equivalence is gone when data are
    weighted. The correct cost function is then :func:`poisson_chi2`.
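
    Examples
    --------
    A minimal sketch that fits the shape of a normal distribution to toy data; the
    cdf is only approximately normalized over the chosen histogram range::

        import numpy as np
        from scipy.stats import norm
        from iminuit import Minuit
        from iminuit.cost import BinnedNLL

        rng = np.random.default_rng(1)
        n, xe = np.histogram(rng.normal(size=1000), bins=20, range=(-3, 3))

        def model(xe, mu, sigma):
            return norm.cdf(xe, mu, sigma)

        cost = BinnedNLL(n, xe, model)
        m = Minuit(cost, mu=0.0, sigma=1.0)
        m.migrad()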
    """

    __slots__ = ("_chi2",)

    @property
    def cdf(self):
        """Get cumulative density function."""
        return self._model

    def __init__(
        self,
        n: ArrayLike,
        xe: Union[ArrayLike, Sequence[ArrayLike]],
        cdf: Model,
        *,
        verbose: int = 0,
        grad: Optional[ModelGradient] = None,
        use_pdf: str = "",
        name: Optional[Sequence[str]] = None,
    ):
        """
        Initialize cost function with data and model.

        Parameters
        ----------
        n : array-like
            Histogram counts. If this is an array with dimension D+1, where D is the
            number of histogram axes, then the last dimension must have two elements
            and is interpreted as pairs of sum of weights and sum of weights squared.
        xe : array-like or collection of array-like
            Bin edge locations; must have len(n) + 1 entries, one more than the number
            of bins. If the histogram has more than one axis, xe must be a collection
            of the bin edge locations along each axis.
        cdf : callable
            Cumulative density function of the form f(xe, par0, par1, ..., parN),
            where xe is a bin edge and par0, ... are model parameters. The corresponding
            density must be normalized to unity over the space covered by the histogram.
            If the model is multivariate, xe must be an array-like with shape (D, N),
            where D is the dimension and N is the number of points where the model is
            evaluated.
        verbose : int, optional
            Verbosity level. 0: no output (default).
            1: print current args and negative log-likelihood value.
        grad : callable or None, optional
            Optionally pass the gradient of the cdf (Default is None). Has the same
            calling signature as the cdf, but must return an array with the shape (K,
            N), where N is the number of data points and K is the number of parameters.
            The gradient can be used by Minuit to improve or speed up convergence.
        use_pdf : str, optional
            Either "", "numerical", or "approximate" (Default is ""). If the model cdf
            is not available, but the model pdf is, this option can be set to
            "numerical" or "approximate" to compute the integral of the pdf over the bin
            patch. The option "numerical" uses numerical integration, which is accurate
            but computationally expensive and only supported for 1D histograms. The
            option "approximate" uses the zero-order approximation of evaluating the pdf
            at the bin center, multiplied with the bin area. This is fast and works in
            higher dimensions, but can lead to biased results if the curvature of the
            pdf inside the bin is significant.
        name : sequence of str or None, optional
            Optional names for each parameter of the model (in order). Must have the
            same length as there are model parameters. Default is None.
        """
        super().__init__(n, xe, cdf, verbose, grad, use_pdf, name)
        if self._bohm_zech_s is None:
            self._chi2 = multinomial_chi2
        else:
            self._chi2 = poisson_chi2

    def _pred(self, args: Sequence[float]) -> NDArray:
        # must return array of full length, mask not applied yet
        p = super()._pred(args)
        # normalise probability of remaining bins
        ma = self.mask
        if ma is not None:
            p /= np.sum(p[ma])
        # scale probabilities with total number of entries of unmasked bins in histogram
        return p * np.sum(self._counts())

    def _value(self, args: Sequence[float]) -> float:
        mu = self._pred(args)
        n, mu = self._transformed(mu)
        return self._chi2(n.reshape(-1), mu.reshape(-1))

    def _grad(self, args: Sequence[float]) -> NDArray:
        # pg and p must be arrays of full length, mask not applied yet
        pg = super()._pred_grad(args)
        p = super()._pred(args)
        ma = self.mask
        # normalise probability of remaining bins
        if ma is not None:
            psum = np.sum(p[ma])
            pg = pg / psum - p * np.sum(pg[:, ma]) / psum**2
            p /= psum
        # scale probabilities with total number of entries of unmasked bins in histogram
        n = self._counts()
        ntot = np.sum(n)
        mu = p * ntot
        gmu = pg * ntot
        ma = self.mask
        if ma is not None:
            mu = mu[ma]
            gmu = gmu[:, ma]
        n = n.reshape(-1)
        mu = mu.reshape(-1)
        gmu = gmu.reshape(gmu.shape[0], -1)
        s = self._bohm_zech_s
        if s is None:
            return _multinomial_chi2_grad(n, mu, gmu)
        # use original n and mu because Bohm-Zech scale factor cancels
        s = s.reshape(-1)
        return _poisson_chi2_grad(n, mu, s * gmu)


class ExtendedBinnedNLL(BinnedCostWithModel):
    """
    Binned extended negative log-likelihood.

    Use this if shape and normalization of the fitted PDF are of interest and the data
    is binned. This cost function works with normal and weighted histograms. The
    histogram can be one- or multi-dimensional.

    For weighted data, the cost function assumes that the weights are independent of
    the data. This is not the case for sWeights, and the uncertainties for results
    obtained with sWeights will only be approximately correct, see C. Langenbruch,
    Eur.Phys.J.C 82 (2022) 5, 393.

    The cost function has a minimum value that is asymptotically chi2-distributed. It is
    constructed from the log-likelihood assuming a Poisson distribution and using the
    saturated model as a reference.
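
    Examples
    --------
    A minimal sketch with a scaled exponential cdf fitted to toy data; the scaled cdf
    integrates to the total yield ``ntot`` over the positive real line::

        import numpy as np
        from iminuit import Minuit
        from iminuit.cost import ExtendedBinnedNLL

        rng = np.random.default_rng(1)
        n, xe = np.histogram(rng.exponential(size=1000), bins=20)

        def scaled_cdf(xe, ntot, tau):
            return ntot * (1.0 - np.exp(-xe / tau))

        cost = ExtendedBinnedNLL(n, xe, scaled_cdf)
        m = Minuit(cost, ntot=1000.0, tau=1.0)
        m.migrad()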
    """

    __slots__ = ()

    @property
    def scaled_cdf(self):
        """Get integrated density model."""
        return self._model

    def __init__(
        self,
        n: ArrayLike,
        xe: Union[ArrayLike, Sequence[ArrayLike]],
        scaled_cdf: Model,
        *,
        verbose: int = 0,
        grad: Optional[ModelGradient] = None,
        use_pdf: str = "",
        name: Optional[Sequence[str]] = None,
    ):
        """
        Initialize cost function with data and model.

        Parameters
        ----------
        n : array-like
            Histogram counts. If this is an array with dimension D+1, where D is the
            number of histogram axes, then the last dimension must have two elements
            and is interpreted as pairs of sum of weights and sum of weights squared.
        xe : array-like or collection of array-like
            Bin edge locations; must have len(n) + 1 entries, one more than the number
            of bins. If the histogram has more than one axis, xe must be a collection
            of the bin edge locations along each axis.
        scaled_cdf : callable
            Scaled cumulative density function of the form f(xe, par0, [par1, ...]),
            where xe is a bin edge and par0, ... are model parameters. If the model is
            multivariate, xe must be an array-like with shape (D, N), where D is the
            dimension and N is the number of points where the model is evaluated.
        verbose : int, optional
            Verbosity level. 0: no output (default). 1: print current args and
            negative log-likelihood value.
        grad : callable or None, optional
            Optionally pass the gradient of the scaled cdf (Default is None). Has the
            same calling signature as the scaled cdf, but must return an array with the
            shape (K, N), where N is the number of data points and K is the number of
            parameters. The gradient can be used by Minuit to improve or speed up
            convergence.
        use_pdf : str, optional
            Either "", "numerical", or "approximate". If the model cdf is not available,
            but the model pdf is, this option can be set to "numerical" or "approximate"
            to compute the integral of the pdf over the bin patch. The option
            "numerical" uses numerical integration, which is accurate but
            computationally expensive and only supported for 1D histograms. The option
            "approximate" uses the zero-order approximation of evaluating the pdf at the
            bin center, multiplied with the bin area. This is fast and works in higher
            dimensions, but can lead to biased results if the curvature of the pdf
            inside the bin is significant.
        name : sequence of str or None, optional
            Optional names for each parameter of the model (in order). Must have the
            same length as there are model parameters. Default is None.
        """
        super().__init__(n, xe, scaled_cdf, verbose, grad, use_pdf, name)

    def _value(self, args: Sequence[float]) -> float:
        mu = self._pred(args)
        n, mu = self._transformed(mu)
        return poisson_chi2(n.reshape(-1), mu.reshape(-1))

    def _grad(self, args: Sequence[float]) -> NDArray:
        mu = self._pred(args)
        gmu = self._pred_grad(args)
        ma = self.mask
        if ma is not None:
            mu = mu[ma]
            gmu = gmu[:, ma]
        mu = mu.reshape(-1)
        gmu = gmu.reshape(gmu.shape[0], -1)
        n = self._counts().reshape(-1)
        s = self._bohm_zech_s
        if s is None:
            return _poisson_chi2_grad(n, mu, gmu)
        # use original n and mu because Bohm-Zech scale factor cancels
        s = s.reshape(-1)
        return _poisson_chi2_grad(n, mu, s * gmu)


class LeastSquares(MaskedCostWithPulls):
    """
    Least-squares cost function (aka chisquare function).

    Use this if you have data of the form (x, y +/- yerror), where x can be
    one-dimensional or multi-dimensional, but y is always one-dimensional. See
    :meth:`__init__` for details on how to use a multivariate model.
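
    Examples
    --------
    A minimal sketch of a straight-line fit to toy data::

        import numpy as np
        from iminuit import Minuit
        from iminuit.cost import LeastSquares

        x = np.array([1.0, 2.0, 3.0, 4.0])
        y = np.array([1.1, 1.9, 3.2, 4.1])
        yerr = 0.1

        def line(x, a, b):
            return a + b * x

        cost = LeastSquares(x, y, yerr, line)
        m = Minuit(cost, a=0.0, b=1.0)
        m.migrad()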
    """

    __slots__ = "_loss", "_cost", "_cost_grad", "_model", "_model_grad", "_ndim"

    _loss: Union[str, LossFunction]
    _cost: Callable[[ArrayLike, ArrayLike, ArrayLike], float]
    _cost_grad: Optional[Callable[[NDArray, NDArray, NDArray, NDArray], NDArray]]
    _model: Model
    _model_grad: Optional[ModelGradient]
    _ndim: int

    @property
    def x(self):
        """Get explanatory variables."""
        if self._ndim == 1:
            return self.data[:, 0]
        return self.data.T[: self._ndim]

    @x.setter
    def x(self, value):
        if self._ndim == 1:
            self.data[:, 0] = _norm(value)
        else:
            self.data[:, : self._ndim] = _norm(value).T
        self._update_cache()

    @property
    def y(self):
        """Get samples."""
        return self.data[:, self._ndim]

    @y.setter
    def y(self, value):
        self.data[:, self._ndim] = _norm(value)
        self._update_cache()

    @property
    def yerror(self):
        """Get sample uncertainties."""
        return self.data[:, self._ndim + 1]

    @yerror.setter
    def yerror(self, value):
        self.data[:, self._ndim + 1] = _norm(value)
        self._update_cache()

    @property
    def model(self):
        """Get model of the form y = f(x, par0, [par1, ...])."""
        if len(self._parameters) == 1:
            return lambda x, *args: (
                self._model(x, args) if len(args) > 1 else self._model(x, *args)
            )
        else:
            return self._model

    @property
    def loss(self):
        """Get loss function."""
        return self._loss

    @loss.setter
    def loss(self, loss: Union[str, LossFunction]):
        self._loss = loss
        if isinstance(loss, str):
            if loss == "linear":
                self._cost = chi2
                self._cost_grad = _chi2_grad
            elif loss == "soft_l1":
                self._cost = _soft_l1_cost  # type: ignore
                self._cost_grad = _soft_l1_cost_grad
            else:
                raise ValueError(f"unknown loss {loss!r}")
        elif isinstance(loss, LossFunction):
            self._cost = lambda y, ye, ym: np.sum(
                loss(_z_squared(y, ye, ym))  # type:ignore
            )
            self._cost_grad = None
        else:
            raise ValueError("loss must be str or LossFunction")

    def __init__(
        self,
        x: ArrayLike,
        y: ArrayLike,
        yerror: ArrayLike,
        model: Model,
        *,
        loss: Union[str, LossFunction] = "linear",
        verbose: int = 0,
        grad: Optional[ModelGradient] = None,
        name: Optional[Sequence[str]] = None,
    ):
        """
        Initialize cost function with data and model.

        Parameters
        ----------
        x : array-like
            Locations where the model is evaluated. If the model is multivariate, x must
            have shape (D, N), where D is the number of dimensions and N the number of
            data points.
        y : array-like
            Observed values. Must have the same length as x.
        yerror : array-like or float
            Estimated uncertainty of observed values. Must have same shape as y or be a
            scalar, which is then broadcast to the same shape as y.
        model : callable
            Function of the form f(x, par0, [par1, ...]) whose output is compared to
            observed values, where x is the location and par0, ... are model parameters.
            If the model is multivariate, x has shape (D, N), where D is the number
            of dimensions and N the number of data points.
        loss : str or callable, optional
            The loss function can be modified to make the fit robust against outliers,
            see scipy.optimize.least_squares for details. Only "linear" (default) and
            "soft_l1" are currently implemented, but users can pass any loss function as
            this argument. It should be a monotonic, twice differentiable function,
            which accepts the squared residual and returns a modified squared residual.
        verbose : int, optional
            Verbosity level. 0: no output (default). 1: print current args and
            negative log-likelihood value.
        grad : callable or None, optional
            Optionally pass the gradient of the model (Default is None). Has the same
            calling signature as the model, but must return an array with the shape
            (K, N), where N is the number of data points and K is the number of
            parameters. The gradient can be used by Minuit to improve or speed up
            convergence.
        name : sequence of str or None, optional
            Optional names for each parameter of the model (in order). Must have the
            same length as there are model parameters. Default is None.

        Notes
        -----
        Alternative loss functions make the fit more robust against outliers by
        weakening the pull of outliers. The mechanical analog of a least-squares fit is
        a system with attractive forces. The points pull the model towards them with a
        force whose potential is given by :math:`rho(z)` for a squared-offset :math:`z`.
        The plot shows the standard potential in comparison with the weaker soft-l1
        potential, in which outliers act with a constant force independent of their
        distance.

        .. plot:: plots/loss.py
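
        A custom loss can be passed as a callable. Below is a minimal sketch with an
        arctan loss, chosen purely for illustration::

            import numpy as np
            from iminuit.cost import LeastSquares

            def arctan_loss(z):
                # z is the squared standardized residual
                return np.arctan(z)

            def line(x, a, b):
                return a + b * x

            cost = LeastSquares([1.0, 2.0, 3.0], [1.1, 1.9, 3.2], 0.1, line,
                                loss=arctan_loss)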
        """
        x = _norm(x)
        y = _norm(y)
        assert x.ndim >= 1  # guaranteed by _norm

        self._ndim = x.shape[0] if x.ndim > 1 else 1
        self._model = model
        self._model_grad = grad
        self.loss = loss

        x = np.atleast_2d(x)
        data = np.column_stack(np.broadcast_arrays(*x, y, yerror))
        super().__init__(_model_parameters(model, name), data, verbose)

    def _ndata(self):
        return len(self._masked)

    def visualize(
        self, args: ArrayLike, model_points: Union[int, Sequence[float]] = 0
    ) -> Tuple[Tuple[NDArray, NDArray, NDArray], Tuple[NDArray, NDArray]]:
        """
        Visualize data and model agreement (requires matplotlib).

        The visualization is drawn with matplotlib.pyplot into the current axes.

        Parameters
        ----------
        args : array-like
            Parameter values.

        model_points : int or array-like, optional
            How many points to use to draw the model. Default is 0; in this case
            a smart sampling algorithm selects the number of points. If array-like,
            it is interpreted as the point locations.
        """
        from matplotlib import pyplot as plt

        if self._ndim > 1:
            raise ValueError("visualize is not implemented for multi-dimensional data")

        x, y, ye = self._masked.T
        plt.errorbar(x, y, ye, fmt="ok")
        if isinstance(model_points, Iterable):
            xm = np.array(model_points)
            ym = self.model(xm, *args)
        elif model_points > 0:
            if _detect_log_spacing(x):
                xm = np.geomspace(x[0], x[-1], model_points)
            else:
                xm = np.linspace(x[0], x[-1], model_points)
            ym = self.model(xm, *args)
        else:
            xm, ym = _smart_sampling(lambda x: self.model(x, *args), x[0], x[-1])
        plt.plot(xm, ym)
        return (x, y, ye), (xm, ym)

    def prediction(self, args: Sequence[float]) -> NDArray:
        """
        Return the prediction from the fitted model.

        Parameters
        ----------
        args : array-like
            Parameter values.

        Returns
        -------
        NDArray
            Model prediction for each data point.
        """
        return self.model(self.x, *args)

    def _pulls(self, args: Sequence[float]) -> NDArray:
        y = self.y.copy()
        ye = self.yerror.copy()
        ym = self.prediction(args)

        if self.mask is not None:
            ma = ~self.mask
            y[ma] = np.nan
            ye[ma] = np.nan
        return (y - ym) / ye

    def _pred(self, args: Sequence[float]) -> NDArray:
        x = self._masked.T[0] if self._ndim == 1 else self._masked.T[: self._ndim]
        ym = self._model(x, *args)
        return _normalize_output(ym, "model", self._ndata())

    def _pred_grad(self, args: Sequence[float]) -> NDArray:
        if self._model_grad is None:
            raise ValueError("no gradient available")  # pragma: no cover
        x = self._masked.T[0] if self._ndim == 1 else self._masked.T[: self._ndim]
        ymg = self._model_grad(x, *args)
        return _normalize_output(ymg, "model gradient", self.npar, self._ndata())

    def _value(self, args: Sequence[float]) -> float:
        y, ye = self._masked.T[self._ndim :]
        ym = self._pred(args)
        return self._cost(y, ye, ym)

    def _grad(self, args: Sequence[float]) -> NDArray:
        if self._cost_grad is None:
            raise ValueError("no cost gradient available")  # pragma: no cover
        y, ye = self._masked.T[self._ndim :]
        ym = self._pred(args)
        ymg = self._pred_grad(args)
        return self._cost_grad(y, ye, ym, ymg)

    def _has_grad(self) -> bool:
        return self._model_grad is not None and self._cost_grad is not None


class NormalConstraint(Cost):
    """
    Gaussian penalty for one or several parameters.

    The Gaussian penalty acts like a pseudo-measurement of the parameter itself, based
    on a (multi-variate) normal distribution. Penalties can be set for one or several
    parameters at once (which is more efficient). When several parameters are
    constrained, one can specify the full covariance matrix of the parameters.

    Notes
    -----
    It is sometimes necessary to add a weak penalty on a parameter to avoid
    instabilities in the fit. A typical example in high-energy physics is the fit of a
    signal peak above some background. If the amplitude of the peak vanishes, the shape
    parameters of the peak become unconstrained and the fit becomes unstable. This can
    be avoided by adding a weak (large uncertainty) penalty on the shape parameters;
    its pull is negligible if the peak amplitude is non-zero.

    This class can also be used to approximately include external measurements of some
    parameters, if the original cost function is not available or too costly to compute.
    If the external measurement was performed in the asymptotic limit with a large
    sample, a Gaussian penalty is an accurate statistical representation of the external
    result.
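
    Examples
    --------
    A minimal sketch that adds an external measurement sigma = 1.0 +/- 0.1 to another
    cost function; ``other_cost`` stands in for any cost function with a parameter
    named ``sigma``::

        from iminuit.cost import NormalConstraint

        constraint = NormalConstraint("sigma", 1.0, 0.1)
        total_cost = other_cost + constraint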
    """

    __slots__ = "_expected", "_cov", "_covinv"

    def __init__(
        self,
        args: Union[str, Iterable[str]],
        value: ArrayLike,
        error: ArrayLike,
    ):
        """
        Initialize the normal constraint with expected value(s) and error(s).

        Parameters
        ----------
        args : str or sequence of str
            Parameter name(s).
        value : float or array-like
            Expected value(s). Must have same length as `args`.
        error : float or array-like
            Expected error(s). If 1D, must have same length as `args`. If 2D, must be
            the covariance matrix of the parameters.
        """
        tp_args = (args,) if isinstance(args, str) else tuple(args)
        nargs = len(tp_args)
        self._expected = _norm(value)
        if self._expected.ndim > 1:
            raise ValueError("value must be a scalar or one-dimensional")
        # args can be a vector of values, in this case we have nargs == 1
        if nargs > 1 and len(self._expected) != nargs:
            raise ValueError("size of value does not match size of args")
        self._cov = _norm(error)
        if len(self._cov) != len(self._expected):
            raise ValueError("size of error does not match size of value")
        if self._cov.ndim < 2:
            self._cov **= 2
        elif self._cov.ndim == 2:
            if not is_positive_definite(self._cov):
                raise ValueError("covariance matrix is not positive definite")
        else:
            raise ValueError("covariance matrix cannot have more than two dimensions")
        self._covinv = _covinv(self._cov)
        super().__init__({k: None for k in tp_args}, False)

    @property
    def covariance(self):
        """
        Get expected covariance of parameters.

        Can be 1D (diagonal of covariance matrix) or 2D (full covariance matrix).
        """
        return self._cov

    @covariance.setter
    def covariance(self, value):
        value = np.asarray(value)
        if value.ndim == 2 and not is_positive_definite(value):
            raise ValueError("covariance matrix is not positive definite")
        self._cov[:] = value
        self._covinv = _covinv(self._cov)

    @property
    def value(self):
        """Get expected parameter values."""
        return self._expected

    @value.setter
    def value(self, value):
        self._expected[:] = value

    def _value(self, args: Sequence[float]) -> float:
        delta = args - self._expected
        if self._covinv.ndim < 2:
            return np.sum(delta**2 * self._covinv)
        return np.einsum("i,ij,j", delta, self._covinv, delta)

    def _grad(self, args: Sequence[float]) -> NDArray:
        delta = args - self._expected
        if self._covinv.ndim < 2:
            return 2 * delta * self._covinv
        return 2 * self._covinv @ delta

    def _has_grad(self) -> bool:
        return True

    def _ndata(self):
        return len(self._expected)

    def visualize(self, args: ArrayLike):
        """
        Visualize data and model agreement (requires matplotlib).

        The visualization is drawn with matplotlib.pyplot into the current axes.

        Parameters
        ----------
        args : array-like
            Parameter values.
        """
        from matplotlib import pyplot as plt

        args = np.atleast_1d(args)

        par = self._parameters
        val = self.value
        cov = self.covariance
        if cov.ndim == 2:
            cov = np.diag(cov)
        err = np.sqrt(cov)

        n = len(par)

        i = 0
        max_pull = 0
        for v, e, a in zip(val, err, args):
            pull = (a - v) / e
            max_pull = max(abs(pull), max_pull)
            plt.errorbar(pull, -i, 0, 1, fmt="o", color="C0")
            i += 1
        plt.axvline(0, color="k")
        plt.xlim(-max_pull - 1.1, max_pull + 1.1)
        yaxis = plt.gca().yaxis
        yaxis.set_ticks(-np.arange(n))
        yaxis.set_ticklabels(par)
        plt.ylim(-n + 0.5, 0.5)


def _norm(value: ArrayLike) -> NDArray:
    value = np.atleast_1d(value)
    dtype = value.dtype
    if dtype.kind != "f":
        value = value.astype(np.float64)
    return value


def _covinv(array):
    return np.linalg.inv(array) if array.ndim == 2 else 1.0 / array


def _normalize_output(x, kind, *shape, msg=None):
    if not isinstance(x, np.ndarray):
        if msg is None:
            msg = f"{kind} should return numpy array, but returns {type(x)}"
        else:
            msg = f"{kind} should return numpy array {msg}, but returns {type(x)}"
        warnings.warn(msg, PerformanceWarning)
        x = np.array(x)
        if x.dtype.kind != "f":
            return x.astype(float)
    if x.ndim < len(shape):
        return x.reshape(*shape)
    elif x.shape != shape:
        # NumPy 2 uses a numpy int here
        pretty_shape = tuple(int(i) for i in shape)
        msg = (
            f"output of {kind} has shape {x.shape!r}, but {pretty_shape!r} is required"
        )
        raise ValueError(msg)
    return x


def _shape_from_xe(xe):
    if isinstance(xe[0], Iterable):
        return tuple(len(xei) - 1 for xei in xe)
    return (len(xe) - 1,)


def _model_parameters(model, name):
    # strip first argument from model
    ann = describe(model, annotations=True)
    args = iter(ann)
    next(args)
    params = {k: ann[k] for k in args}
    if name:
        if len(params) == len(name):
            params = {n: att for (n, att) in zip(name, params.values())}
        elif len(params) > 0:
            raise ValueError("length of name does not match number of model parameters")
        else:
            params = {n: None for n in name}
    return params


_deprecated_content = {
    "BarlowBeestonLite": ("Template", Template),
    "barlow_beeston_lite_chi2_jsc": ("template_chi2_jsc", template_chi2_jsc),
    "barlow_beeston_lite_chi2_hpd": ("template_chi2_da", template_chi2_da),
    "multinominal_chi2": ("multinomial_chi2", multinomial_chi2),
}


def __getattr__(name: str) -> Any:
    if name in _deprecated_content:
        new_name, obj = _deprecated_content[name]
        warnings.warn(
            f"{name} was renamed to {new_name}, please import {new_name} instead",
            FutureWarning,
            stacklevel=2,
        )
        return obj

    raise AttributeError