001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.statistics.inference;
019
020import java.util.Arrays;
021import java.util.Objects;
022import java.util.function.DoubleSupplier;
023import java.util.function.DoubleUnaryOperator;
024import java.util.function.IntToDoubleFunction;
025import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
026import org.apache.commons.numbers.combinatorics.Factorial;
027import org.apache.commons.numbers.core.ArithmeticUtils;
028import org.apache.commons.numbers.core.Sum;
029import org.apache.commons.rng.UniformRandomProvider;
030
031/**
032 * Implements the Kolmogorov-Smirnov (K-S) test for equality of continuous distributions.
033 *
034 * <p>The one-sample test uses a D statistic based on the maximum deviation of the empirical
035 * distribution of sample data points from the distribution expected under the null hypothesis.
036 *
037 * <p>The two-sample test uses a D statistic based on the maximum deviation of the two empirical
038 * distributions of sample data points. The two-sample tests evaluate the null hypothesis that
039 * the two samples {@code x} and {@code y} come from the same underlying distribution.
040 *
041 * <p>References:
042 * <ol>
043 * <li>
044 * Marsaglia, G., Tsang, W. W., &amp; Wang, J. (2003).
045 * <a href="https://doi.org/10.18637/jss.v008.i18">Evaluating Kolmogorov's Distribution.</a>
046 * Journal of Statistical Software, 8(18), 1–4.</li>
047 * <li>Simard, R., &amp; L’Ecuyer, P. (2011).
048 * <a href="https://doi.org/10.18637/jss.v039.i11">Computing the Two-Sided Kolmogorov-Smirnov Distribution.</a>
049 * Journal of Statistical Software, 39(11), 1–18.</li>
050 * <li>Sekhon, J. S. (2011).
051 * <a href="https://doi.org/10.18637/jss.v042.i07">
052 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
053 * The Matching package for R.</a>
054 * Journal of Statistical Software, 42(7), 1–52.</li>
055 * <li>Viehmann, T (2021).
056 * <a href="https://doi.org/10.48550/arXiv.2102.08037">
057 * Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.</a>
058 * arXiv:2102.08037</li>
059 * <li>Hodges, J. L. (1958).
060 * <a href="https://doi.org/10.1007/BF02589501">
 * The significance probability of the Smirnov two-sample test.</a>
062 * Arkiv for Matematik, 3(5), 469-486.</li>
063 * </ol>
064 *
065 * <p>Note that [1] contains an error in computing h, refer to <a
066 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
067 *
068 * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
069 * Kolmogorov-Smirnov (K-S) test (Wikipedia)</a>
070 * @since 1.1
071 */
072public final class KolmogorovSmirnovTest {
    /** Name for sample 1. */
    private static final String SAMPLE_1_NAME = "Sample 1";
    /** Name for sample 2. */
    private static final String SAMPLE_2_NAME = "Sample 2";
    /** When the largest sample size exceeds this value, the 2-sample test AUTO p-value
     * uses an asymptotic distribution to compute the p-value. */
    private static final int LARGE_SAMPLE = 10000;
    /** Maximum finite factorial (largest n such that n! is a finite double). */
    private static final int MAX_FACTORIAL = 170;
    /** Maximum length of an array. This is used to determine if two arrays can be concatenated
     * to create a sampler from the joint distribution. The limit is copied from the limit
     * of java.util.ArrayList. */
    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
    /** The maximum least common multiple (lcm) to attempt the exact p-value computation.
     * The integral d value is in [0, n*m] in steps of the greatest common divisor (gcd),
     * thus lcm = n*m/gcd is the number of possible different p-values.
     * Some methods have a lower limit due to computation limits. This should be larger
     * than LARGE_SAMPLE^2 so all AUTO p-values attempt an exact computation, i.e.
     * at least 10000^2 ~ 2^26.56. */
    private static final long MAX_LCM_TWO_SAMPLE_EXACT_P = 1L << 31;
    /** Placeholder to use for the two-sample sign array when the value can be ignored. */
    private static final int[] IGNORED_SIGN = new int[1];
    /** Placeholder to use for the two-sample ties D array when the value can be ignored. */
    private static final long[] IGNORED_D = new long[2];
    /** Default instance: two-sided, AUTO p-value, non-strict inequality, no RNG, 1000 iterations. */
    private static final KolmogorovSmirnovTest DEFAULT = new KolmogorovSmirnovTest(
        AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, false, null, 1000);

    /** Alternative hypothesis. */
    private final AlternativeHypothesis alternative;
    /** Method to compute the p-value. */
    private final PValueMethod pValueMethod;
    /** Use a strict inequality for the two-sample exact p-value. */
    private final boolean strictInequality;
    /** Source of randomness (may be null; required only for p-value estimation by sampling). */
    private final UniformRandomProvider rng;
    /** Number of iterations used when estimating the p-value by sampling. */
    private final int iterations;
111
112    /**
113     * Result for the one-sample Kolmogorov-Smirnov test.
114     *
115     * <p>This class is immutable.
116     *
117     * @since 1.1
118     */
119    public static class OneResult extends BaseSignificanceResult {
120        /** Sign of the statistic. */
121        private final int sign;
122
123        /**
124         * Create an instance.
125         *
126         * @param statistic Test statistic.
127         * @param sign Sign of the statistic.
128         * @param p Result p-value.
129         */
130        OneResult(double statistic, int sign, double p) {
131            super(statistic, p);
132            this.sign = sign;
133        }
134
135        /**
136         * Gets the sign of the statistic. This is 1 for \(D^+\) and -1 for \(D^-\).
137         * For a two sided-test this is zero if the magnitude of \(D^+\) and \(D^-\) was equal;
138         * otherwise the sign indicates the larger of \(D^+\) or \(D^-\).
139         *
140         * @return the sign
141         */
142        public int getSign() {
143            return sign;
144        }
145    }
146
147    /**
148     * Result for the two-sample Kolmogorov-Smirnov test.
149     *
150     * <p>This class is immutable.
151     *
152     * @since 1.1
153     */
154    public static final class TwoResult extends OneResult {
155        /** Flag to indicate there were significant ties.
156         * Note that in extreme cases there may be significant ties despite {@code upperD == D}
157         * due to rounding when converting the integral statistic to a double. For this
158         * reason the presence of ties is stored as a flag. */
159        private final boolean significantTies;
160        /** Upper bound of the D statistic from all possible paths through regions with ties. */
161        private final double upperD;
162        /** The p-value of the upper D value. */
163        private final double upperP;
164
165        /**
166         * Create an instance.
167         *
168         * @param statistic Test statistic.
169         * @param sign Sign of the statistic.
170         * @param p Result p-value.
171         * @param significantTies Flag to indicate there were significant ties.
172         * @param upperD Upper bound of the D statistic from all possible paths through
173         * regions with ties.
174         * @param upperP The p-value of the upper D value.
175         */
176        TwoResult(double statistic, int sign, double p, boolean significantTies, double upperD, double upperP) {
177            super(statistic, sign, p);
178            this.significantTies = significantTies;
179            this.upperD = upperD;
180            this.upperP = upperP;
181        }
182
183        /**
184         * {@inheritDoc}
185         *
186         * <p><strong>Ties</strong>
187         *
188         * <p>The presence of ties in the data creates a distribution for the D values generated
189         * by all possible orderings of the tied regions. This statistic is computed using the
190         * path with the <em>minimum</em> effect on the D statistic.
191         *
192         * <p>For a one-sided statistic \(D^+\) or \(D^-\), this is the lower bound of the D statistic.
193         *
194         * <p>For a two-sided statistic D, this may be <em>below</em> the lower bound of the
195         * distribution of all possible D values. This case occurs when the number of ties
196         * is very high and is identified by a {@link #getPValue() p-value} of 1.
197         *
198         * <p>If the two-sided statistic is zero this only occurs in the presence of ties:
199         * either the two arrays are identical, are 'identical' data of a single value
200         * (sample sizes may be different), or have a sequence of ties of 'identical' data
201         * with a net zero effect on the D statistic, e.g.
202         * <pre>
203         *  [1,2,3]           vs [1,2,3]
204         *  [0,0,0,0]         vs [0,0,0]
205         *  [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
206         * </pre>
207         */
208        @Override
209        public double getStatistic() {
210            // Note: This method is here for documentation
211            return super.getStatistic();
212        }
213
214        /**
215         * Returns {@code true} if there were ties between samples that occurred
216         * in a region which could change the D statistic if the ties were resolved to
217         * a defined order.
218         *
219         * <p>Ties between the data can be interpreted as if the values were different
220         * but within machine epsilon. In this case the order within the tie region is not known.
221         * If the most extreme ordering of any tied regions (e.g. all tied values of {@code x}
222         * before all tied values of {@code y}) could create a larger D statistic this
223         * method will return {@code true}.
224         *
225         * <p>If there were no ties, or all possible orderings of tied regions create the same
226         * D statistic, this method returns {@code false}.
227         *
228         * <p>Note it is possible that this method returns {@code true} when {@code D == upperD}
229         * due to rounding when converting the computed D statistic to a double. This will
230         * only occur for large sample sizes {@code n} and {@code m} where the product
231         * {@code n*m >= 2^53}.
232         *
233         * @return true if the D statistic could be changed by resolution of ties
234         * @see #getUpperD()
235         */
236        public boolean hasSignificantTies() {
237            return significantTies;
238        }
239
240        /**
241         * Return the upper bound of the D statistic from all possible paths through regions with ties.
242         *
243         * <p>This will return a value equal to or greater than {@link #getStatistic()}.
244         *
245         * @return the upper bound of D
246         * @see #hasSignificantTies()
247         */
248        public double getUpperD() {
249            return upperD;
250        }
251
252        /**
253         * Return the p-value of the upper bound of the D statistic.
254         *
255         * <p>If computed, this will return a value equal to or less than
256         * {@link #getPValue() getPValue}. It may be orders of magnitude smaller.
257         *
258         * <p>Note: This p-value corresponds to the most extreme p-value from all possible
259         * orderings of tied regions. It is <strong>not</strong> recommended to use this to
260         * reject the null hypothesis. The upper bound of D and the corresponding p-value
261         * provide information that must be interpreted in the context of the sample data and
262         * used to inform a decision on the suitability of the test to the data.
263         *
264         * <p>This value is set to {@link Double#NaN NaN} if the {@link #getPValue() p-value} was
265         * {@linkplain PValueMethod#ESTIMATE estimated}. The estimated p-value will have been created
266         * using a distribution of possible D values given the underlying joint distribution of
267         * the sample data. Comparison of the p-value to the upper p-value is not applicable.
268         *
269         * @return the p-value of the upper bound of D (or NaN)
270         * @see #getUpperD()
271         */
272        public double getUpperPValue() {
273            return upperP;
274        }
275    }
276
277    /**
278     * @param alternative Alternative hypothesis.
279     * @param method P-value method.
280     * @param strict true to use a strict inequality.
281     * @param rng Source of randomness.
282     * @param iterations Number of iterations.
283     */
284    private KolmogorovSmirnovTest(AlternativeHypothesis alternative, PValueMethod method, boolean strict,
285        UniformRandomProvider rng, int iterations) {
286        this.alternative = alternative;
287        this.pValueMethod = method;
288        this.strictInequality = strict;
289        this.rng = rng;
290        this.iterations = iterations;
291    }
292
293    /**
294     * Return an instance using the default options.
295     *
296     * <ul>
297     * <li>{@link AlternativeHypothesis#TWO_SIDED}</li>
298     * <li>{@link PValueMethod#AUTO}</li>
299     * <li>{@link Inequality#NON_STRICT}</li>
300     * <li>{@linkplain #with(UniformRandomProvider) RNG = none}</li>
301     * <li>{@linkplain #withIterations(int) Iterations = 1000}</li>
302     * </ul>
303     *
304     * @return default instance
305     */
306    public static KolmogorovSmirnovTest withDefaults() {
307        return DEFAULT;
308    }
309
310    /**
311     * Return an instance with the configured alternative hypothesis.
312     *
313     * @param v Value.
314     * @return an instance
315     */
316    public KolmogorovSmirnovTest with(AlternativeHypothesis v) {
317        return new KolmogorovSmirnovTest(Objects.requireNonNull(v), pValueMethod, strictInequality, rng, iterations);
318    }
319
320    /**
321     * Return an instance with the configured p-value method.
322     *
323     * <p>For the one-sample two-sided test Kolmogorov's asymptotic approximation can be used;
324     * otherwise the p-value uses the distribution of the D statistic.
325     *
326     * <p>For the two-sample test the exact p-value can be computed for small sample sizes;
327     * otherwise the p-value resorts to the asymptotic approximation. Alternatively a p-value
328     * can be estimated from the combined distribution of the samples. This requires a source
329     * of randomness.
330     *
331     * @param v Value.
332     * @return an instance
333     * @see #with(UniformRandomProvider)
334     */
335    public KolmogorovSmirnovTest with(PValueMethod v) {
336        return new KolmogorovSmirnovTest(alternative, Objects.requireNonNull(v), strictInequality, rng, iterations);
337    }
338
339    /**
340     * Return an instance with the configured inequality.
341     *
342     * <p>Computes the p-value for the two-sample test as \(P(D_{n,m} &gt; d)\) if strict;
343     * otherwise \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample
344     * Kolmogorov-Smirnov statistic, either the two-sided \(D_{n,m}\) or one-sided
345     * \(D_{n,m}^+\) or \(D_{n,m}^-\).
346     *
347     * @param v Value.
348     * @return an instance
349     */
350    public KolmogorovSmirnovTest with(Inequality v) {
351        return new KolmogorovSmirnovTest(alternative, pValueMethod,
352            Objects.requireNonNull(v) == Inequality.STRICT, rng, iterations);
353    }
354
355    /**
356     * Return an instance with the configured source of randomness.
357     *
358     * <p>Applies to the two-sample test when the p-value method is set to
359     * {@link PValueMethod#ESTIMATE ESTIMATE}. The randomness
360     * is used for sampling of the combined distribution.
361     *
362     * <p>There is no default source of randomness. If the randomness is not
363     * set then estimation is not possible and an {@link IllegalStateException} will be
364     * raised in the two-sample test.
365     *
366     * @param v Value.
367     * @return an instance
368     * @see #with(PValueMethod)
369     */
370    public KolmogorovSmirnovTest with(UniformRandomProvider v) {
371        return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality,
372            Objects.requireNonNull(v), iterations);
373    }
374
375    /**
376     * Return an instance with the configured number of iterations.
377     *
378     * <p>Applies to the two-sample test when the p-value method is set to
379     * {@link PValueMethod#ESTIMATE ESTIMATE}. This is the number of sampling iterations
380     * used to estimate the p-value. The p-value is a fraction using the {@code iterations}
381     * as the denominator. The number of significant digits in the p-value is
382     * upper bounded by log<sub>10</sub>(iterations); small p-values have fewer significant
383     * digits. A large number of iterations is recommended when using a small critical
384     * value to reject the null hypothesis.
385     *
386     * @param v Value.
387     * @return an instance
388     * @throws IllegalArgumentException if the number of iterations is not strictly positive
389     */
390    public KolmogorovSmirnovTest withIterations(int v) {
391        return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality, rng,
392            Arguments.checkStrictlyPositive(v));
393    }
394
395    /**
396     * Computes the one-sample Kolmogorov-Smirnov test statistic.
397     *
398     * <ul>
399     * <li>two-sided: \(D_n=\sup_x |F_n(x)-F(x)|\)</li>
400     * <li>greater: \(D_n^+=\sup_x (F_n(x)-F(x))\)</li>
401     * <li>less: \(D_n^-=\sup_x (F(x)-F_n(x))\)</li>
402     * </ul>
403     *
404     * <p>where \(F\) is the distribution cumulative density function ({@code cdf}),
405     * \(n\) is the length of {@code x} and \(F_n\) is the empirical distribution that puts
406     * mass \(1/n\) at each of the values in {@code x}.
407     *
408     * <p>The cumulative distribution function should map a real value {@code x} to a probability
409     * in [0, 1]. To use a reference distribution the CDF can be passed using a method reference:
410     * <pre>
411     * UniformContinuousDistribution dist = UniformContinuousDistribution.of(0, 1);
412     * UniformRandomProvider rng = RandomSource.KISS.create(123);
413     * double[] x = dist.sampler(rng).samples(100);
414     * double d = KolmogorovSmirnovTest.withDefaults().statistic(x, dist::cumulativeProbability);
415     * </pre>
416     *
417     * @param cdf Reference cumulative distribution function.
418     * @param x Sample being evaluated.
419     * @return Kolmogorov-Smirnov statistic
420     * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
421     * @see #test(double[], DoubleUnaryOperator)
422     */
423    public double statistic(double[] x, DoubleUnaryOperator cdf) {
424        return computeStatistic(x, cdf, IGNORED_SIGN);
425    }
426
427    /**
428     * Computes the two-sample Kolmogorov-Smirnov test statistic.
429     *
430     * <ul>
431     * <li>two-sided: \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)</li>
432     * <li>greater: \(D_{n,m}^+=\sup_x (F_n(x)-F_m(x))\)</li>
433     * <li>less: \(D_{n,m}^-=\sup_x (F_m(x)-F_n(x))\)</li>
434     * </ul>
435     *
436     * <p>where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
437     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
438     * is the empirical distribution that puts mass \(1/m\) at each of the values in {@code y}.
439     *
440     * @param x First sample.
441     * @param y Second sample.
442     * @return Kolmogorov-Smirnov statistic
443     * @throws IllegalArgumentException if either {@code x} or {@code y} does not
444     * have length at least 2; or contain NaN values.
445     * @see #test(double[], double[])
446     */
447    public double statistic(double[] x, double[] y) {
448        final int n = checkArrayLength(x);
449        final int m = checkArrayLength(y);
450        // Clone to avoid destructive modification of input
451        final long dnm = computeIntegralKolmogorovSmirnovStatistic(x.clone(), y.clone(),
452                IGNORED_SIGN, IGNORED_D);
453        // Re-use the method to compute D in [0, 1] for consistency
454        return computeD(dnm, n, m, ArithmeticUtils.gcd(n, m));
455    }
456
457    /**
458     * Performs a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis
459     * that {@code x} conforms to the distribution cumulative density function ({@code cdf}).
460     *
461     * <p>The test is defined by the {@link AlternativeHypothesis}:
462     * <ul>
463     * <li>Two-sided evaluates the null hypothesis that the two distributions are
464     * identical, \(F_n(i) = F(i)\) for all \( i \); the alternative is that the are not
465     * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.</li>
466     * <li>Greater evaluates the null hypothesis that the \(F_n(i) &lt;= F(i)\) for all \( i \);
467     * the alternative is \(F_n(i) &gt; F(i)\) for at least one \( i \). The statistic is \( D_n^+ \).</li>
468     * <li>Less evaluates the null hypothesis that the \(F_n(i) &gt;= F(i)\) for all \( i \);
469     * the alternative is \(F_n(i) &lt; F(i)\) for at least one \( i \). The statistic is \( D_n^- \).</li>
470     * </ul>
471     *
472     * <p>The p-value method defaults to exact. The one-sided p-value uses Smirnov's stable formula:
473     *
474     * <p>\[ P(D_n^+ \ge x) = x \sum_{j=0}^{\lfloor n(1-x) \rfloor} \binom{n}{j} \left(\frac{j}{n} + x\right)^{j-1} \left(1-x-\frac{j}{n} \right)^{n-j} \]
475     *
476     * <p>The two-sided p-value is computed using methods described in
477     * Simard &amp; L’Ecuyer (2011). The two-sided test supports an asymptotic p-value
478     * using Kolmogorov's formula:
479     *
480     * <p>\[ \lim_{n\to\infty} P(\sqrt{n}D_n &gt; z) = 2 \sum_{i=1}^\infty (-1)^{i-1} e^{-2 i^2 z^2} \]
481     *
482     * @param x Sample being being evaluated.
483     * @param cdf Reference cumulative distribution function.
484     * @return test result
485     * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
486     * @see #statistic(double[], DoubleUnaryOperator)
487     */
488    public OneResult test(double[] x, DoubleUnaryOperator cdf) {
489        final int[] sign = {0};
490        final double d = computeStatistic(x, cdf, sign);
491        final double p;
492        if (alternative == AlternativeHypothesis.TWO_SIDED) {
493            PValueMethod method = pValueMethod;
494            if (method == PValueMethod.AUTO) {
495                // No switch to the asymptotic for large n
496                method = PValueMethod.EXACT;
497            }
498            if (method == PValueMethod.ASYMPTOTIC) {
499                // Kolmogorov's asymptotic formula using z = sqrt(n) * d
500                p = KolmogorovSmirnovDistribution.ksSum(Math.sqrt(x.length) * d);
501            } else {
502                // exact
503                p = KolmogorovSmirnovDistribution.Two.sf(d, x.length);
504            }
505        } else {
506            // one-sided: always use exact
507            p = KolmogorovSmirnovDistribution.One.sf(d, x.length);
508        }
509        return new OneResult(d, sign[0], p);
510    }
511
512    /**
513     * Performs a two-sample Kolmogorov-Smirnov test on samples {@code x} and {@code y}.
514     * Test the empirical distributions \(F_n\) and \(F_m\) where \(n\) is the length
515     * of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the empirical distribution
516     * that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) is the empirical
517     * distribution that puts mass \(1/m\) of the {@code y} values.
518     *
519     * <p>The test is defined by the {@link AlternativeHypothesis}:
520     * <ul>
521     * <li>Two-sided evaluates the null hypothesis that the two distributions are
522     * identical, \(F_n(i) = F_m(i)\) for all \( i \); the alternative is that they are not
523     * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.</li>
524     * <li>Greater evaluates the null hypothesis that the \(F_n(i) &lt;= F_m(i)\) for all \( i \);
525     * the alternative is \(F_n(i) &gt; F_m(i)\) for at least one \( i \). The statistic is \( D_n^+ \).</li>
526     * <li>Less evaluates the null hypothesis that the \(F_n(i) &gt;= F_m(i)\) for all \( i \);
527     * the alternative is \(F_n(i) &lt; F_m(i)\) for at least one \( i \). The statistic is \( D_n^- \).</li>
528     * </ul>
529     *
530     * <p>If the {@linkplain PValueMethod p-value method} is auto, then an exact p computation
531     * is attempted if both sample sizes are less than 10000 using the methods presented in
532     * Viehmann (2021) and Hodges (1958); otherwise an asymptotic p-value is returned.
533     * The two-sided p-value is \(\overline{F}(d, \sqrt{mn / (m + n)})\) where \(\overline{F}\)
534     * is the complementary cumulative density function of the two-sided one-sample
535     * Kolmogorov-Smirnov distribution. The one-sided p-value uses an approximation from
536     * Hodges (1958) Eq 5.3.
537     *
538     * <p>\(D_{n,m}\) has a discrete distribution. This makes the p-value associated with the
539     * null hypothesis \(H_0 : D_{n,m} \gt d \) differ from \(H_0 : D_{n,m} \ge d \)
540     * by the mass of the observed value \(d\). These can be distinguished using an
541     * {@link Inequality} parameter. This is ignored for large samples.
542     *
543     * <p>If the data contains ties there is no defined ordering in the tied region to use
544     * for the difference between the two empirical distributions. Each ordering of the
545     * tied region <em>may</em> create a different D statistic. All possible orderings
546     * generate a distribution for the D value. In this case the tied region is traversed
547     * entirely and the effect on the D value evaluated at the end of the tied region.
548     * This is the path with the least change on the D statistic. The path with the
549     * greatest change on the D statistic is also computed as the upper bound on D.
550     * If these two values are different then the tied region is known to generate a
551     * distribution for the D statistic and the p-value is an over estimate for the cases
552     * with a larger D statistic. The presence of any significant tied regions is returned
553     * in the result.
554     *
555     * <p>If the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE} then the p-value is
556     * estimated by repeat sampling of the joint distribution of {@code x} and {@code y}.
557     * The p-value is the frequency that a sample creates a D statistic
558     * greater than or equal to (or greater than for strict inequality) the observed value.
559     * In this case a source of randomness must be configured or an {@link IllegalStateException}
560     * will be raised. The p-value for the upper bound on D will not be estimated and is set to
561     * {@link Double#NaN NaN}. This estimation procedure is not affected by ties in the data
562     * and is increasingly robust for larger datasets. The method is modeled after
563     * <a href="https://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
564     * in the R {@code Matching} package (Sekhon (2011)).
565     *
566     * @param x First sample.
567     * @param y Second sample.
568     * @return test result
569     * @throws IllegalArgumentException if either {@code x} or {@code y} does not
570     * have length at least 2; or contain NaN values.
571     * @throws IllegalStateException if the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE}
572     * and there is no source of randomness.
573     * @see #statistic(double[], double[])
574     */
575    public TwoResult test(double[] x, double[] y) {
576        final int n = checkArrayLength(x);
577        final int m = checkArrayLength(y);
578        PValueMethod method = pValueMethod;
579        final int[] sign = {0};
580        final long[] tiesD = {0, 0};
581
582        final double[] sx = x.clone();
583        final double[] sy = y.clone();
584        final long dnm = computeIntegralKolmogorovSmirnovStatistic(sx, sy, sign, tiesD);
585
586        // Compute p-value. Note that the p-value is not invalidated by ties; it is the
587        // D statistic that could be invalidated by resolution of the ties. So compute
588        // the exact p even if ties are present.
589        if (method == PValueMethod.AUTO) {
590            // Use exact for small samples
591            method = Math.max(n, m) < LARGE_SAMPLE ?
592                PValueMethod.EXACT :
593                PValueMethod.ASYMPTOTIC;
594        }
595        final int gcd = ArithmeticUtils.gcd(n, m);
596        final double d = computeD(dnm, n, m, gcd);
597        final boolean significantTies = tiesD[1] > dnm;
598        final double d2 = significantTies ? computeD(tiesD[1], n, m, gcd) : d;
599
600        final double p;
601        final double p2;
602
603        // Allow bootstrap estimation of the p-value
604        if (method == PValueMethod.ESTIMATE) {
605            p = estimateP(sx, sy, dnm);
606            p2 = Double.NaN;
607        } else {
608            final boolean exact = method == PValueMethod.EXACT;
609            p = twoSampleP(dnm, n, m, gcd, d, exact);
610            if (significantTies) {
611                // Compute the upper bound on D.
612                // The p-value is also computed. The alternative is to save the options
613                // in the result with (upper dnm, n, m) and compute it on-demand.
614                // Note detection of whether the exact P computation is possible is based on
615                // n and m, thus this will use the same computation.
616                p2 = twoSampleP(tiesD[1], n, m, gcd, d2, exact);
617            } else {
618                p2 = p;
619            }
620        }
621        return new TwoResult(d, sign[0], p, significantTies, d2, p2);
622    }
623
624    /**
625     * Estimates the <i>p-value</i> of a two-sample Kolmogorov-Smirnov test evaluating the
626     * null hypothesis that {@code x} and {@code y} are samples drawn from the same
627     * probability distribution.
628     *
629     * <p>This method will destructively modify the input arrays (via a sort).
630     *
631     * <p>This method estimates the p-value by repeatedly sampling sets of size
632     * {@code x.length} and {@code y.length} from the empirical distribution
633     * of the combined sample. The memory requirement is an array copy of each of
634     * the input arguments.
635     *
636     * <p>When using strict inequality, this is equivalent to the algorithm implemented in
637     * the R function {@code ks.boot} as described in Sekhon (2011) Journal of Statistical
638     * Software, 42(7), 1–52 [3].
639     *
640     * @param x First sample.
641     * @param y Second sample.
642     * @param dnm Integral D statistic.
643     * @return p-value
644     * @throws IllegalStateException if the source of randomness is null.
645     */
646    private double estimateP(double[] x, double[] y, long dnm) {
647        if (rng == null) {
648            throw new IllegalStateException("No source of randomness");
649        }
650
651        // Test if the random statistic is greater (strict), or greater or equal to d
652        final long d = strictInequality ? dnm : dnm - 1;
653
654        final long plus;
655        final long minus;
656        if (alternative == AlternativeHypothesis.GREATER_THAN) {
657            plus = d;
658            minus = Long.MIN_VALUE;
659        } else if (alternative == AlternativeHypothesis.LESS_THAN) {
660            plus = Long.MAX_VALUE;
661            minus = -d;
662        } else {
663            // two-sided
664            plus = d;
665            minus = -d;
666        }
667
668        // Test dnm=0. This occurs for example when x == y.
669        if (0 < minus || 0 > plus) {
670            // Edge case where all possible d will be outside the inclusive bounds
671            return 1;
672        }
673
674        // Sample randomly with replacement from the combined distribution.
675        final DoubleSupplier gen = createSampler(x, y, rng);
676        int count = 0;
677        for (int i = iterations; i > 0; i--) {
678            for (int j = 0; j < x.length; j++) {
679                x[j] = gen.getAsDouble();
680            }
681            for (int j = 0; j < y.length; j++) {
682                y[j] = gen.getAsDouble();
683            }
684            if (testIntegralKolmogorovSmirnovStatistic(x, y, plus, minus)) {
685                count++;
686            }
687        }
688        return count / (double) iterations;
689    }
690
691    /**
692     * Computes the magnitude of the one-sample Kolmogorov-Smirnov test statistic.
693     * The sign of the statistic is optionally returned in {@code sign}. For the two-sided case
694     * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
695     * had the larger magnitude.
696     *
697     * @param x Sample being evaluated.
698     * @param cdf Reference cumulative distribution function.
699     * @param sign Sign of the statistic (non-zero length).
700     * @return Kolmogorov-Smirnov statistic
701     * @throws IllegalArgumentException if {@code data} does not have length at least 2;
702     * or contains NaN values.
703     */
704    private double computeStatistic(double[] x, DoubleUnaryOperator cdf, int[] sign) {
705        final int n = checkArrayLength(x);
706        final double nd = n;
707        final double[] sx = sort(x.clone(), "Sample");
708        // Note: ties in the data do not matter as we compare the empirical CDF
709        // immediately before the value (i/n) and at the value (i+1)/n. For ties
710        // of length m this would be (i-m+1)/n and (i+1)/n and the result is the same.
711        double d = 0;
712        if (alternative == AlternativeHypothesis.GREATER_THAN) {
713            // Compute D+
714            for (int i = 0; i < n; i++) {
715                final double yi = cdf.applyAsDouble(sx[i]);
716                final double dp = (i + 1) / nd - yi;
717                d = dp > d ? dp : d;
718            }
719            sign[0] = 1;
720        } else if (alternative == AlternativeHypothesis.LESS_THAN) {
721            // Compute D-
722            for (int i = 0; i < n; i++) {
723                final double yi = cdf.applyAsDouble(sx[i]);
724                final double dn = yi - i / nd;
725                d = dn > d ? dn : d;
726            }
727            sign[0] = -1;
728        } else {
729            // Two sided.
730            // Compute both (as unsigned) and return the sign indicating the largest result.
731            double plus = 0;
732            double minus = 0;
733            for (int i = 0; i < n; i++) {
734                final double yi = cdf.applyAsDouble(sx[i]);
735                final double dn = yi - i / nd;
736                final double dp = (i + 1) / nd - yi;
737                minus = dn > minus ? dn : minus;
738                plus = dp > plus ? dp : plus;
739            }
740            sign[0] = Double.compare(plus, minus);
741            d = Math.max(plus, minus);
742        }
743        return d;
744    }
745
746    /**
747     * Computes the two-sample Kolmogorov-Smirnov test statistic. The statistic is integral
748     * and has a value in {@code [0, n*m]}. Not all values are possible; the smallest
749     * increment is the greatest common divisor of {@code n} and {@code m}.
750     *
751     * <p>This method will destructively modify the input arrays (via a sort).
752     *
753     * <p>The sign of the statistic is returned in {@code sign}. For the two-sided case
754     * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
755     * had the larger magnitude. If the two-sided statistic is zero the two arrays are
756     * identical, or are 'identical' data of a single value (sample sizes may be different),
757     * or have a sequence of ties of 'identical' data with a net zero effect on the D statistic
758     * e.g.
759     * <pre>
760     *  [1,2,3]           vs [1,2,3]
761     *  [0,0,0,0]         vs [0,0,0]
762     *  [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
763     * </pre>
764     *
765     * <p>This method detects ties in the input data. Not all ties will invalidate the returned
766     * statistic. Ties between the data can be interpreted as if the values were different
767     * but within machine epsilon. In this case the path through the tie region is not known.
768     * All paths through the tie region end at the same point. This method will compute the
769     * most extreme path through each tie region and the D statistic for these paths. If the
770     * ties D statistic is a larger magnitude than the returned D statistic then at least
771     * one tie region lies at a point on the full path that could result in a different
772     * statistic in the absence of ties. This signals the P-value computed using the returned
773     * D statistic is one of many possible p-values given the data and how ties are resolved.
774     * Note: The tiesD value may be smaller than the returned D statistic as it is not set
775     * to the maximum of D or tiesD. The only result of interest is when {@code tiesD > D}
776     * due to a tie region that can change the output D. On output {@code tiesD[0] != 0}
777     * if there were ties between samples and {@code tiesD[1] = D} of the most extreme path
778     * through the ties.
779     *
780     * @param x First sample (destructively modified by sort).
781     * @param y Second sample  (destructively modified by sort).
782     * @param sign Sign of the statistic (non-zero length).
783     * @param tiesD Integral statistic for the most extreme path through any ties (length at least 2).
784     * @return integral Kolmogorov-Smirnov statistic
785     * @throws IllegalArgumentException if either {@code x} or {@code y} contain NaN values.
786     */
787    private long computeIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, int[] sign, long[] tiesD) {
788        // Sort the sample arrays
789        sort(x, SAMPLE_1_NAME);
790        sort(y, SAMPLE_2_NAME);
791        final int n = x.length;
792        final int m = y.length;
793
794        // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
795        // Scale by n*m to use increments of m and n for x and y.
796        // Find the max difference between cdf_x and cdf_y.
797        int i = 0;
798        int j = 0;
799        long d = 0;
800        long plus = 0;
801        long minus = 0;
802        // Ties: store the D+,D- for most extreme path though tie region(s)
803        long tplus = 0;
804        long tminus = 0;
805        do {
806            // No NaNs so compare using < and >
807            if (x[i] < y[j]) {
808                final double z = x[i];
809                do {
810                    i++;
811                    d += m;
812                } while (i < n && x[i] == z);
813                plus = d > plus ? d : plus;
814            } else if (x[i] > y[j]) {
815                final double z = y[j];
816                do {
817                    j++;
818                    d -= n;
819                } while (j < m && y[j] == z);
820                minus = d < minus ? d : minus;
821            } else {
822                // Traverse to the end of the tied section for d.
823                // Also compute the most extreme path through the tied region.
824                final double z = x[i];
825                final long dd = d;
826                int k = i;
827                do {
828                    i++;
829                } while (i < n && x[i] == z);
830                k = i - k;
831                d += k * (long) m;
832                // Extreme D+ path
833                tplus = d > tplus ? d : tplus;
834                k = j;
835                do {
836                    j++;
837                } while (j < m && y[j] == z);
838                k = j - k;
839                d -= k * (long) n;
840                // Extreme D- path must start at the original d
841                tminus = Math.min(tminus, dd - k * (long) n);
842                // End of tied section
843                if (d > plus) {
844                    plus = d;
845                } else if (d < minus) {
846                    minus = d;
847                }
848            }
849        } while (i < n && j < m);
850        // The presence of any ties is flagged by a non-zero value for D+ or D-.
851        // Note we cannot use the selected tiesD value as in the one-sided case it may be zero
852        // and the non-selected D value will be non-zero.
853        tiesD[0] = tplus | tminus;
854        // For simplicity the correct tiesD is not returned (correct value is commented).
855        // The only case that matters is tiesD > D which is evaluated by the caller.
856        // Note however that the distance of tiesD < D is a measure of how little the
857        // tied region matters.
858        if (alternative == AlternativeHypothesis.GREATER_THAN) {
859            sign[0] = 1;
860            // correct = max(tplus, plus)
861            tiesD[1] = tplus;
862            return plus;
863        } else if (alternative == AlternativeHypothesis.LESS_THAN) {
864            sign[0] = -1;
865            // correct = -min(tminus, minus)
866            tiesD[1] = -tminus;
867            return -minus;
868        } else {
869            // Two sided.
870            sign[0] = Double.compare(plus, -minus);
871            d = Math.max(plus, -minus);
872            // correct = max(d, max(tplus, -tminus))
873            tiesD[1] = Math.max(tplus, -tminus);
874            return d;
875        }
876    }
877
878    /**
879     * Test if the two-sample integral Kolmogorov-Smirnov statistic is strictly greater
880     * than the specified values for D+ and D-. Note that D- should have a negative sign
881     * to impose an inclusive lower bound.
882     *
883     * <p>This method will destructively modify the input arrays (via a sort).
884     *
885     * <p>For a two-sided alternative hypothesis {@code plus} and {@code minus} should have the
886     * same magnitude with opposite signs.
887     *
888     * <p>For a one-sided alternative hypothesis the value of {@code plus} or {@code minus}
889     * should have the expected value of the statistic, and the opposite D should have the maximum
890     * or minimum long value. The {@code minus} should be negatively signed:
891     *
892     * <ul>
893     * <li>greater: {@code plus} = D, {@code minus} = {@link Long#MIN_VALUE}</li>
     * <li>less than: {@code minus} = -D, {@code plus} = {@link Long#MAX_VALUE}</li>
895     * </ul>
896     *
897     * <p>Note: This method has not been specialized for the one-sided case. Specialization
898     * would eliminate a conditional branch for {@code d > Long.MAX_VALUE} or
899     * {@code d < Long.MIN_VALUE}. Since these branches are never possible in the one-sided case
900     * this should be efficiently chosen by branch prediction in a processor pipeline.
901     *
902     * @param x First sample (destructively modified by sort; must not contain NaN).
903     * @param y Second sample (destructively modified by sort; must not contain NaN).
904     * @param plus Limit on D+ (inclusive upper bound).
905     * @param minus Limit on D- (inclusive lower bound).
906     * @return true if the D value exceeds the provided limits
907     */
908    private static boolean testIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, long plus, long minus) {
909        // Sort the sample arrays
910        Arrays.sort(x);
911        Arrays.sort(y);
912        final int n = x.length;
913        final int m = y.length;
914
915        // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
916        // Scale by n*m to use increments of m and n for x and y.
917        // Find the any difference that exceeds the specified bounds.
918        int i = 0;
919        int j = 0;
920        long d = 0;
921        do {
922            // No NaNs so compare using < and >
923            if (x[i] < y[j]) {
924                final double z = x[i];
925                do {
926                    i++;
927                    d += m;
928                } while (i < n && x[i] == z);
929                if (d > plus) {
930                    return true;
931                }
932            } else if (x[i] > y[j]) {
933                final double z = y[j];
934                do {
935                    j++;
936                    d -= n;
937                } while (j < m && y[j] == z);
938                if (d < minus) {
939                    return true;
940                }
941            } else {
942                // Traverse to the end of the tied section for d.
943                final double z = x[i];
944                do {
945                    i++;
946                    d += m;
947                } while (i < n && x[i] == z);
948                do {
949                    j++;
950                    d -= n;
951                } while (j < m && y[j] == z);
952                // End of tied section
953                if (d > plus || d < minus) {
954                    return true;
955                }
956            }
957        } while (i < n && j < m);
958        // Note: Here d requires returning to zero. This is applicable to the one-sided
959        // cases since d may have always been above zero (favours D+) or always below zero
960        // (favours D-). This is ignored as the method is not called when dnm=0 is
961        // outside the inclusive bounds.
962        return false;
963    }
964
965    /**
966     * Creates a sampler to sample randomly from the combined distribution of the two samples.
967     *
968     * @param x First sample.
969     * @param y Second sample.
970     * @param rng Source of randomness.
971     * @return the sampler
972     */
973    private static DoubleSupplier createSampler(double[] x, double[] y,
974                                                UniformRandomProvider rng) {
975        return createSampler(x, y, rng, MAX_ARRAY_SIZE);
976    }
977
978    /**
979     * Creates a sampler to sample randomly from the combined distribution of the two
980     * samples. This will copy the input data for the sampler.
981     *
982     * @param x First sample.
983     * @param y Second sample.
984     * @param rng Source of randomness.
985     * @param maxArraySize Maximum size of a single array.
986     * @return the sampler
987     */
    static DoubleSupplier createSampler(double[] x, double[] y,
                                        UniformRandomProvider rng,
                                        int maxArraySize) {
        final int n = x.length;
        final int m = y.length;
        final int len = n + m;
        // Overflow safe: len > maxArraySize
        // NOTE(review): this relies on int wrap-around if n + m exceeds
        // Integer.MAX_VALUE; holds for the default maxArraySize near 2^31 —
        // confirm behaviour if callers pass a small custom limit.
        if (len - maxArraySize > 0) {
            // Support sampling with maximum length arrays
            // (where a concatenated array is not possible)
            // by choosing one or the other.
            // - generate i in [-n, m)
            // - return i < 0 ? x[n + i] : y[i]
            // The sign condition is a 50-50 branch.
            // Perform branchless by extracting the sign bit to pick the array.
            // Copy the source data.
            final double[] xx = x.clone();
            final double[] yy = y.clone();
            final IntToDoubleFunction nextX = i -> xx[n + i];
            final IntToDoubleFunction nextY = i -> yy[i];
            // Arrange function which accepts the negative index at position [1]
            final IntToDoubleFunction[] next = {nextY, nextX};
            return () -> {
                final int i = rng.nextInt(-n, m);
                // i >>> 31 is 1 when i is negative (selects nextX), otherwise 0 (nextY)
                return next[i >>> 31].applyAsDouble(i);
            };
        }
        // Concatenate arrays and sample uniformly from the pooled data
        final double[] z = new double[len];
        System.arraycopy(x, 0, z, 0, n);
        System.arraycopy(y, 0, z, n, m);
        return () -> z[rng.nextInt(len)];
    }
1021
1022    /**
1023     * Compute the D statistic from the integral D value.
1024     *
1025     * @param dnm Integral D-statistic value (in [0, n*m]).
1026     * @param n First sample size.
1027     * @param m Second sample size.
1028     * @param gcd Greatest common divisor of n and m.
1029     * @return D-statistic value (in [0, 1]).
1030     */
1031    private static double computeD(long dnm, int n, int m, int gcd) {
1032        // Note: Integer division using the gcd is intentional
1033        final long a = dnm / gcd;
1034        final int b = m / gcd;
1035        return a / ((double) n * b);
1036    }
1037
1038    /**
1039     * Computes \(P(D_{n,m} &gt; d)\) for the 2-sample Kolmogorov-Smirnov statistic.
1040     *
1041     * @param dnm Integral D-statistic value (in [0, n*m]).
1042     * @param n First sample size.
1043     * @param m Second sample size.
1044     * @param gcd Greatest common divisor of n and m.
1045     * @param d D-statistic value (in [0, 1]).
1046     * @param exact whether to compute the exact probability; otherwise approximate.
1047     * @return probability
1048     * @see #twoSampleExactP(long, int, int, int, boolean, boolean)
1049     * @see #twoSampleApproximateP(double, int, int, boolean)
1050     */
1051    private double twoSampleP(long dnm, int n, int m, int gcd, double d, boolean exact) {
1052        // Exact computation returns -1 if it cannot compute.
1053        double p = -1;
1054        if (exact) {
1055            p = twoSampleExactP(dnm, n, m, gcd, strictInequality, alternative == AlternativeHypothesis.TWO_SIDED);
1056        }
1057        if (p < 0) {
1058            p = twoSampleApproximateP(d, n, m, alternative == AlternativeHypothesis.TWO_SIDED);
1059        }
1060        return p;
1061    }
1062
1063    /**
1064     * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
1065     * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic, either the two-sided
     * \(D_{n,m}\) or one-sided \(D_{n,m}^+\). See
1067     * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
1068     *
1069     * <p>The returned probability is exact. If the value cannot be computed this returns -1.
1070     *
1071     * <p>Note: This requires the greatest common divisor of n and m. The integral D statistic
1072     * in the range [0, n*m] is separated by increments of the gcd. The method will only
1073     * compute p-values for valid values of D by calculating for D/gcd.
     * Strict inequality is performed using the next valid value for D.
1075     *
1076     * @param dnm Integral D-statistic value (in [0, n*m]).
1077     * @param n First sample size.
1078     * @param m Second sample size.
1079     * @param gcd Greatest common divisor of n and m.
1080     * @param strict whether or not the probability to compute is expressed as a strict inequality.
1081     * @param twoSided whether D refers to D or D+.
1082     * @return probability that a randomly selected m-n partition of m + n generates D
1083     *         greater than (resp. greater than or equal to) {@code d} (or -1)
1084     */
1085    static double twoSampleExactP(long dnm, int n, int m, int gcd, boolean strict, boolean twoSided) {
1086        // Create the statistic in [0, lcm]
1087        // For strict inequality D > d the result is the same if we compute for D >= (d+1)
1088        final long d = dnm / gcd + (strict ? 1 : 0);
1089
1090        // P-value methods compute for d <= lcm (least common multiple)
1091        final long lcm = (long) n * (m / gcd);
1092        if (d > lcm) {
1093            return 0;
1094        }
1095
1096        // Note: Some methods require m >= n, others n >= m
1097        final int a = Math.min(n, m);
1098        final int b = Math.max(n, m);
1099
1100        if (twoSided) {
1101            // Any two-sided statistic dnm cannot be less than min(n, m) in the absence of ties.
1102            if (d * gcd <= a) {
1103                return 1;
1104            }
1105            // Here d in [2, lcm]
1106            if (n == m) {
1107                return twoSampleTwoSidedPOutsideSquare(d, n);
1108            }
1109            return twoSampleTwoSidedPStabilizedInner(d, b, a, gcd);
1110        }
1111        // Any one-sided statistic cannot be less than 0
1112        if (d <= 0) {
1113            return 1;
1114        }
1115        // Here d in [1, lcm]
1116        if (n == m) {
1117            return twoSampleOneSidedPOutsideSquare(d, n);
1118        }
1119        return twoSampleOneSidedPOutside(d, a, b, gcd);
1120    }
1121
1122    /**
1123     * Computes \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
1124     *
1125     * <p>The returned probability is exact, implemented using the stabilized inner method
1126     * presented in Viehmann (2021).
1127     *
     * <p>This is optimized for {@code m <= n}. If {@code m > n} an index-out-of-bounds
     * exception can occur.
1130     *
1131     * @param d Integral D-statistic value (in [2, lcm])
1132     * @param n Larger sample size.
1133     * @param m Smaller sample size.
1134     * @param gcd Greatest common divisor of n and m.
1135     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1136     *         greater than or equal to {@code d}
1137     */
    private static double twoSampleTwoSidedPStabilizedInner(long d, int n, int m, int gcd) {
        // Check the computation is possible.
        // Note that the possible paths is binom(m+n, n).
        // However the computation is stable above this limit.
        // Possible d values (requiring a unique p-value) = max(dnm) / gcd = lcm(n, m).
        // Possible terms to compute <= n * size(cij)
        // Simple limit based on the number of possible different p-values
        if ((long) n * (m / gcd) > MAX_LCM_TWO_SAMPLE_EXACT_P) {
            return -1;
        }

        // This could be updated to use d in [1, lcm].
        // Currently it uses d in [gcd, n*m].
        // Largest intermediate value is (dnm + im + n) which is within 2^63
        // if n and m are 2^31-1, i = n, dnm = n*m: (2^31-1)^2 + (2^31-1)^2 + 2^31-1 < 2^63
        final long dnm = d * gcd;

        // Viehmann (2021): Updated for i in [0, n], j in [0, m]
        // C_i,j = 1                                      if |i/n - j/m| >= d
        //       = 0                                      if |i/n - j/m| < d and (i=0 or j=0)
        //       = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)  otherwise
        // P2 = C_n,m
        // Note: The python listing in Viehmann used d in [0, 1]. This uses dnm in [0, nm]
        // so updates the scaling to compute the ranges. Also note that the listing uses
        // dist > d or dist < -d where this uses |dist| >= d to compute P(D >= d) (non-strict inequality).
        // The provided listing is explicit in the values for each j in the range.
        // It can be optimized given the known start and end j for each iteration as only
        // j where |i/n - j/m| < d must be processed:
        // startJ where: im - jn < dnm : jn > im - dnm
        // j = floor((im - dnm) / n) + 1      in [0, m]
        // endJ where: jn - im >= dnm
        // j = ceil((dnm + im) / n)           in [0, m+1]

        // First iteration with i = 0
        // j = ceil(dnm / n)
        int endJ = Math.min(m + 1, (int) ((dnm + n - 1) / n));

        // Only require 1 array to store C_i-1,j as the startJ only ever increases
        // and we update lower indices using higher ones.
        // The maximum value *written* is j=m or less using j/m <= 2*d : j = ceil(2*d*m)
        // Viehmann uses: size = int(2*m*d + 2) == ceil(2*d*m) + 1
        // The maximum value *read* is j/m <= 2*d. This may be above m. This occurs when
        // j - lastStartJ > m and C_i-1,j = 1. This can be avoided if (startJ - lastStartJ) <= 1
        // which occurs if m <= n, i.e. the window only slides 0 or 1 in j for each increment i
        // and we can maintain Cij as 1 larger than ceil(2*d*m) + 1.
        final double[] cij = new double[Math.min(m + 1, 2 * endJ + 2)];

        // Each iteration fills C_i,j with values and the remaining values are
        // kept as 1 for |i/n - j/m| >= d
        //assert (endJ - 1) * (long) n < dnm : "jn >= dnm for j < endJ";
        for (int j = endJ; j < cij.length; j++) {
            //assert j * (long) n >= dnm : "jn < dnm for j >= endJ";
            cij[j] = 1;
        }

        int startJ = 0;
        int length = endJ;
        double val = -1;
        long im = 0;
        // Dynamic programming over rows i = 1..n, sliding the window [startJ, endJ)
        // of j values inside the boundary; the single array holds the previous row.
        for (int i = 1; i <= n; i++) {
            im += m;
            final int lastStartJ = startJ;

            // Compute C_i,j for startJ <= j < endJ
            // startJ = floor((im - dnm) / n) + 1      in [0, m]
            // endJ   = ceil((dnm + im) / n)           in [0, m+1]
            startJ = im < dnm ? 0 : Math.min(m, (int) ((im - dnm) / n) + 1);
            endJ = Math.min(m + 1, (int) ((dnm + im + n - 1) / n));

            if (startJ >= endJ) {
                // No possible paths inside the boundary
                return 1;
            }

            //assert startJ - lastStartJ <= 1 : "startJ - lastStartJ > 1";

            // Initialize previous value C_i,j-1
            val = startJ == 0 ? 0 : 1;

            //assert startJ == 0 || Math.abs(im - (startJ - 1) * (long) n) >= dnm : "|im - jn| < dnm for j < startJ";
            //assert endJ > m || Math.abs(im - endJ * (long) n) >= dnm : "|im - jn| < dnm for j >= endJ";
            for (int j = startJ; j < endJ; j++) {
                //assert j == 0 || Math.abs(im - j * (long) n) < dnm : "|im - jn| >= dnm for startJ <= j < endJ";
                // C_i,j = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)
                // Note: if (j - lastStartJ) >= cij.length this creates an IOOB exception.
                // In this case cij[j - lastStartJ] == 1. Only happens when m >= n.
                // Fixed using c_i-1,j = (j - lastStartJ >= cij.length ? 1 : cij[j - lastStartJ]
                val = (cij[j - lastStartJ] * i + val * j) / ((double) i + j);
                cij[j - startJ] = val;
            }

            // Must keep the remaining values in C_i,j as 1 to allow
            // cij[j - lastStartJ] * i == i when (j - lastStartJ) > lastLength
            final int lastLength = length;
            length = endJ - startJ;
            for (int j = lastLength - length - 1; j >= 0; j--) {
                cij[length + j] = 1;
            }
        }
        // Return the most recently written value: C_n,m
        return val;
    }
1240
1241    /**
1242     * Computes \(P(D_{n,m}^+ \ge d)\), where \(D_{n,m}^+\) is the 2-sample one-sided
1243     * Kolmogorov-Smirnov statistic.
1244     *
1245     * <p>The returned probability is exact, implemented using the outer method
1246     * presented in Hodges (1958).
1247     *
1248     * <p>This method will fail-fast and return -1 if the computation of the
1249     * numbers of paths overflows.
1250     *
1251     * @param d Integral D-statistic value (in [0, lcm])
1252     * @param n First sample size.
1253     * @param m Second sample size.
1254     * @param gcd Greatest common divisor of n and m.
1255     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1256     *         greater than or equal to {@code d}
1257     */
    private static double twoSampleOneSidedPOutside(long d, int n, int m, int gcd) {
        // Hodges, Fig.2
        // Lower boundary: (nx - my)/nm >= d : (nx - my) >= dnm
        // B(x, y) is the number of ways from (0, 0) to (x, y) without previously
        // reaching the boundary.
        // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
        // Total paths:
        // sum_y { B(x, y) binom(m+n-x-y, n-y) }

        // Normalized by binom(m+n, n). Check this is possible.
        final long lm = m;
        if (n + lm > Integer.MAX_VALUE) {
            return -1;
        }
        final double binom = binom(m + n, n);
        if (binom == Double.POSITIVE_INFINITY) {
            // Normalization factor is not representable; fail-fast.
            return -1;
        }

        // This could be updated to use d in [1, lcm].
        // Currently it uses d in [gcd, n*m].
        final long dnm = d * gcd;

        // Visit all x in [0, m] where (nx - my) >= d for each increasing y in [0, n].
        // x = ceil( (d + my) / n ) = (d + my + n - 1) / n
        // y = ceil( (nx - d) / m ) = (nx - d + m - 1) / m
        // Note: n m integer, d in [0, nm], the intermediate cannot overflow a long.
        // x | y=0 = (d + n - 1) / n
        final int x0 = (int) ((dnm + n - 1) / n);
        if (x0 >= m) {
            // Single boundary contact possible: one path out of binom(m+n, n)
            return 1 / binom;
        }
        // The y above is the y *on* the boundary. Set the limit as the next y above:
        // y | x=m = 1 + floor( (nx - d) / m ) = 1 + (nm - d) / m
        final int maxy = (int) ((n * lm - dnm + m) / m);
        // Compute x and B(x, y) for visited B(x,y)
        final int[] xy = new int[maxy];
        final double[] bxy = new double[maxy];
        xy[0] = x0;
        bxy[0] = 1;
        for (int y = 1; y < maxy; y++) {
            final int x = (int) ((dnm + lm * y + n - 1) / n);
            // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
            // Add the terms to subtract as a negative sum (inclusion-exclusion over
            // earlier boundary contacts).
            final Sum b = Sum.create();
            for (int yy = 0; yy < y; yy++) {
                // Here: previousX = x - xy[yy] : previousY = y - yy
                // bxy[yy] is the paths to (previousX, previousY)
                // binom represent the paths from (previousX, previousY) to (x, y)
                b.addProduct(bxy[yy], -binom(x - xy[yy] + y - yy, y - yy));
            }
            b.add(binom(x + y, y));
            xy[y] = x;
            bxy[y] = b.getAsDouble();
        }
        // sum_y { B(x, y) binom(m+n-x-y, n-y) }
        final Sum sum = Sum.create();
        for (int y = 0; y < maxy; y++) {
            sum.addProduct(bxy[y], binom(m + n - xy[y] - y, n - y));
        }
        // No individual term should have overflowed since binom is finite.
        // Any sum above 1 is floating-point error.
        return KolmogorovSmirnovDistribution.clipProbability(sum.getAsDouble() / binom);
    }
1322
1323    /**
1324     * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample one-sided
1325     * Kolmogorov-Smirnov statistic.
1326     *
1327     * <p>The returned probability is exact, implemented using the outer method
1328     * presented in Hodges (1958).
1329     *
1330     * @param d Integral D-statistic value (in [1, lcm])
1331     * @param n Sample size.
1332     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1333     *         greater than or equal to {@code d}
1334     */
1335    private static double twoSampleOneSidedPOutsideSquare(long d, int n) {
1336        // Hodges (1958) Eq. 2.3:
1337        // p = binom(2n, n-a) / binom(2n, n)
1338        // a in [1, n] == d * n == dnm / n
1339        final int a = (int) d;
1340
1341        // Rearrange:
1342        // p = ( 2n! / ((n-a)! (n+a)!) ) / ( 2n! / (n! n!) )
1343        //   = n! n! / ( (n-a)! (n+a)! )
1344        // Perform using pre-computed factorials if possible.
1345        if (n + a <= MAX_FACTORIAL) {
1346            final double x = Factorial.doubleValue(n);
1347            final double y = Factorial.doubleValue(n - a);
1348            final double z = Factorial.doubleValue(n + a);
1349            return (x / y) * (x / z);
1350        }
1351        // p = n! / (n-a)!  *  n! / (n+a)!
1352        //       n * (n-1) * ... * (n-a+1)
1353        //   = -----------------------------
1354        //     (n+a) * (n+a-1) * ... * (n+1)
1355
1356        double p = 1;
1357        for (int i = 0; i < a && p != 0; i++) {
1358            p *= (n - i) / (1.0 + n + i);
1359        }
1360        return p;
1361    }
1362
1363    /**
1364     * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample two-sided
1365     * Kolmogorov-Smirnov statistic.
1366     *
1367     * <p>The returned probability is exact, implemented using the outer method
1368     * presented in Hodges (1958).
1369     *
1370     * @param d Integral D-statistic value (in [1, n])
1371     * @param n Sample size.
1372     * @return probability that a randomly selected m-n partition of n + n generates \(D_{n,n}\)
1373     *         greater than or equal to {@code d}
1374     */
1375    private static double twoSampleTwoSidedPOutsideSquare(long d, int n) {
1376        // Hodges (1958) Eq. 2.4:
1377        // p = 2 [ binom(2n, n-a) - binom(2n, n-2a) + binom(2n, n-3a) - ... ] / binom(2n, n)
1378        // a in [1, n] == d * n == dnm / n
1379
1380        // As per twoSampleOneSidedPOutsideSquare, divide by binom(2n, n) and each term
1381        // can be expressed as a product:
1382        //         (             n - i                    n - i                   n - i         )
1383        // p = 2 * ( prod_i=0^a --------- - prod_i=0^2a --------- + prod_i=0^3a --------- + ... )
1384        //         (           1 + n + i                1 + n + i               1 + n + i       )
1385        // for ja in [1, ..., n/a]
1386        // Avoid repeat computation of terms by extracting common products:
1387        // p = 2 * ( p0a * (1 - p1a * (1 - p2a * (1 - ... ))) )
1388        // where each term pja is prod_i={ja}^{ja+a} for all j in [1, n / a]
1389
1390        // The first term is the one-sided p.
1391        final double p0a = twoSampleOneSidedPOutsideSquare(d, n);
1392        if (p0a == 0) {
1393            // Underflow - nothing more to do
1394            return 0;
1395        }
1396        // Compute the inner-terms from small to big.
1397        // j = n / (d/n) ~ n*n / d
1398        // j is a measure of how extreme the d value is (small j is extreme d).
1399        // When j is above 0 a path may traverse from the lower boundary to the upper boundary.
1400        final int a = (int) d;
1401        double p = 0;
1402        for (int j = n / a; j > 0; j--) {
1403            double pja = 1;
1404            final int jaa = j * a + a;
1405            // Since p0a did not underflow we avoid the check for pj != 0
1406            for (int i = j * a; i < jaa; i++) {
1407                pja *= (n - i) / (1.0 + n + i);
1408            }
1409            p = pja * (1 - p);
1410        }
1411        p = p0a * (1 - p);
1412        return Math.min(1, 2 * p);
1413    }
1414
1415    /**
1416     * Compute the binomial coefficient binom(n, k).
1417     *
1418     * @param n N.
1419     * @param k K.
1420     * @return binom(n, k)
1421     */
1422    private static double binom(int n, int k) {
1423        return BinomialCoefficientDouble.value(n, k);
1424    }
1425
1426    /**
1427     * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
1428     * is the 2-sample Kolmogorov-Smirnov statistic. See
1429     * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
1430     *
1431     * <p>Specifically, what is returned is \(1 - CDF(d, \sqrt{mn / (m + n)})\) where CDF
1432     * is the cumulative density function of the two-sided one-sample Kolmogorov-Smirnov
1433     * distribution.
1434     *
1435     * @param d D-statistic value.
1436     * @param n First sample size.
1437     * @param m Second sample size.
1438     * @param twoSided True to compute the two-sided p-value; else one-sided.
1439     * @return approximate probability that a randomly selected m-n partition of m + n generates
1440     *         \(D_{n,m}\) greater than {@code d}
1441     */
1442    static double twoSampleApproximateP(double d, int n, int m, boolean twoSided) {
1443        final double nn = Math.min(n, m);
1444        final double mm = Math.max(n, m);
1445        if (twoSided) {
1446            // Smirnov's asymptotic formula:
1447            // P(sqrt(N) D_n > x)
1448            // N = m*n/(m+n)
1449            return KolmogorovSmirnovDistribution.Two.sf(d, (int) Math.round(mm * nn / (mm + nn)));
1450        }
1451        // one-sided
1452        // Use Hodges Eq 5.3. Requires m >= n
1453        // Correct for m=n, m an integral multiple of n, and 'on the average' for m nearly equal to n
1454        final double z = d * Math.sqrt(nn * mm / (nn + mm));
1455        return Math.exp(-2 * z * z - 2 * z * (mm + 2 * nn) / Math.sqrt(mm * nn * (mm + nn)) / 3);
1456    }
1457
1458    /**
1459     * Verifies that {@code array} has length at least 2.
1460     *
1461     * @param array Array to test.
1462     * @return the length
1463     * @throws IllegalArgumentException if array is too short
1464     */
1465    private static int checkArrayLength(double[] array) {
1466        final int n = array.length;
1467        if (n <= 1) {
1468            throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
1469        }
1470        return n;
1471    }
1472
1473    /**
1474     * Sort the input array. Throws an exception if NaN values are
1475     * present. It is assumed the array is non-zero length.
1476     *
1477     * @param x Input array.
1478     * @param name Name of the array.
1479     * @return a reference to the input (sorted) array
1480     * @throws IllegalArgumentException if {@code x} contains NaN values.
1481     */
1482    private static double[] sort(double[] x, String name) {
1483        Arrays.sort(x);
1484        // NaN will be at the end
1485        if (Double.isNaN(x[x.length - 1])) {
1486            throw new InferenceException(name + " contains NaN");
1487        }
1488        return x;
1489    }
1490}