001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.statistics.descriptive;
019
020import java.util.Objects;
021import org.apache.commons.numbers.arrays.Selection;
022
023/**
024 * Returns the median of the available values.
025 *
026 * <p>For values of length {@code n}, let {@code k = n / 2}:
027 * <ul>
028 * <li>The result is {@code NaN} if {@code n = 0}.</li>
029 * <li>The result is {@code values[k]} if {@code n} is odd.</li>
030 * <li>The result is {@code (values[k - 1] + values[k]) / 2} if {@code n} is even.</li>
031 * </ul>
032 *
033 * <p>This implementation respects the ordering imposed by
034 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs
035 * in the selected positions in the fully sorted values then the result is {@code NaN}.
036 *
037 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values.
038 *
039 * <p>Instances of this class are immutable and thread-safe.
040 *
041 * <p><strong>Support for {@code long} arrays</strong>
042 *
043 * <p>The result on {@code long} values can be returned as a {@code double} or
044 * a {@code long} using a {@link StatisticResult}.
045 *
046 * <p>The {@code double} result is the closest representable value
047 * following floating-point rounding rules. This may be outside the
048 * range defined by the minimum and maximum of the input array following
049 * rounding to a 53-bit floating point representation.
050 * For example the median of an array containing only {@link Long#MAX_VALUE}
051 * as a {@code double} is 2<sup>63</sup>, which is the closest representation of
052 * 2<sup>63</sup> - 1.
053 *
054 * <p>The {@code long} result is returned using the nearest whole number.
055 * In the event of ties the result is rounded towards positive infinity.
056 * For example the median of {@code [2, 3]} is 3, and of {@code [-2, -3]} is -2.
057 * This value will always be within the range defined by the minimum and maximum
058 * of the input array. Due to interpolation it may be a value not observed in
059 * the input values.
060 *
061 * <p>If the array length {@code n} is zero the result as a {@code double} is
062 * {@code NaN} and the result as a {@code long} will raise an {@link ArithmeticException}.
063 *
064 * @see #with(NaNPolicy)
065 * @see <a href="https://en.wikipedia.org/wiki/Median">Median (Wikipedia)</a>
066 * @since 1.1
067 */
068public final class Median {
069    /** Default instance. */
070    private static final Median DEFAULT = new Median(false, NaNPolicy.INCLUDE);
071
072    /** Flag to indicate if the data should be copied. */
073    private final boolean copy;
074    /** NaN policy for floating point data. */
075    private final NaNPolicy nanPolicy;
076    /** Transformer for NaN data. */
077    private final NaNTransformer nanTransformer;
078
079    /**
080     * @param copy Flag to indicate if the data should be copied.
081     * @param nanPolicy NaN policy.
082     */
083    private Median(boolean copy, NaNPolicy nanPolicy) {
084        this.copy = copy;
085        this.nanPolicy = nanPolicy;
086        nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy);
087    }
088
089    /**
090     * Return a new instance with the default options.
091     *
092     * <ul>
093     * <li>{@linkplain #withCopy(boolean) Copy = false}</li>
094     * <li>{@linkplain #with(NaNPolicy) NaN policy = include}</li>
095     * </ul>
096     *
097     * <p>Note: The default options configure for processing in-place and including
098     * {@code NaN} values in the data. This is the most efficient mode and has the
099     * smallest memory consumption.
100     *
101     * @return the median implementation
102     * @see #withCopy(boolean)
103     * @see #with(NaNPolicy)
104     */
105    public static Median withDefaults() {
106        return DEFAULT;
107    }
108
109    /**
110     * Return an instance with the configured copy behaviour. If {@code false} then
111     * the input array will be modified by the call to evaluate the median; otherwise
112     * the computation uses a copy of the data.
113     *
114     * @param v Value.
115     * @return an instance
116     */
117    public Median withCopy(boolean v) {
118        return new Median(v, nanPolicy);
119    }
120
121    /**
122     * Return an instance with the configured {@link NaNPolicy}.
123     *
124     * <p>Note: This implementation respects the ordering imposed by
125     * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is
126     * considered greater than all other values, and all {@code NaN} values are equal. The
127     * {@link NaNPolicy} changes the computation of the statistic in the presence of
128     * {@code NaN} values.
129     *
130     * <ul>
131     * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data;
132     * the size of the data <em>includes</em> the {@code NaN} values and the median will be
133     * {@code NaN} if any value used for median interpolation is {@code NaN}.</li>
134     * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data;
135     * the size of the data <em>excludes</em> the {@code NaN} values and the median will
136     * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero
137     * and the result is {@code NaN}.</li>
138     * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN}
139     * values.</li>
140     * </ul>
141     *
142     * <p>Note that the result is identical for all policies if no {@code NaN} values are present.
143     *
144     * @param v Value.
145     * @return an instance
146     */
147    public Median with(NaNPolicy v) {
148        return new Median(copy, Objects.requireNonNull(v));
149    }
150
151    /**
152     * Evaluate the median.
153     *
154     * <p>Note: This method may partially sort the input values if not configured to
155     * {@link #withCopy(boolean) copy} the input data.
156     *
157     * @param values Values.
158     * @return the median
159     * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
160     * @see #with(NaNPolicy)
161     */
162    public double evaluate(double[] values) {
163        return compute(values, 0, values.length);
164    }
165
166    /**
167     * Evaluate the median of the specified range.
168     *
169     * <p>Note: This method may partially sort the input values if not configured to
170     * {@link #withCopy(boolean) copy} the input data.
171     *
172     * @param values Values.
173     * @param from Inclusive start of the range.
174     * @param to Exclusive end of the range.
175     * @return the median
176     * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
177     * @throws IndexOutOfBoundsException if the sub-range is out of bounds
178     * @see #with(NaNPolicy)
179     * @since 1.2
180     */
181    public double evaluateRange(double[] values, int from, int to) {
182        Statistics.checkFromToIndex(from, to, values.length);
183        return compute(values, from, to);
184    }
185
186    /**
187     * Compute the median of the specified range.
188     *
189     * @param values Values.
190     * @param from Inclusive start of the range.
191     * @param to Exclusive end of the range.
192     * @return the median
193     */
194    private double compute(double[] values, int from, int to) {
195        // Floating-point data handling
196        final int[] bounds = new int[2];
197        final double[] x = nanTransformer.apply(values, from, to, bounds);
198        final int start = bounds[0];
199        final int end = bounds[1];
200        final int n = end - start;
201        // Special cases
202        if (n <= 2) {
203            switch (n) {
204            case 2:
205                // Sorting the array matches the behaviour of Quantile for n==2
206                // Handle NaN and signed zeros
207                if (Double.compare(x[start + 1], x[start]) < 0) {
208                    final double t = x[start];
209                    x[start] = x[start + 1];
210                    x[start + 1] = t;
211                }
212                return Interpolation.mean(x[start], x[start + 1]);
213            case 1:
214                return x[start];
215            default:
216                return Double.NaN;
217            }
218        }
219        // Median index (including the offset)
220        final int m = (start + end) >>> 1;
221        // Odd
222        if ((n & 0x1) == 1) {
223            Selection.select(x, start, end, m);
224            return x[m];
225        }
226        // Even: require (m-1, m)
227        Selection.select(x, start, end, new int[] {m - 1, m});
228        return Interpolation.mean(x[m - 1], x[m]);
229    }
230
231    /**
232     * Evaluate the median.
233     *
234     * <p>Note: This method may partially sort the input values if not configured to
235     * {@link #withCopy(boolean) copy} the input data.
236     *
237     * @param values Values.
238     * @return the median
239     */
240    public double evaluate(int[] values) {
241        return compute(values, 0, values.length);
242    }
243
244    /**
245     * Evaluate the median of the specified range.
246     *
247     * <p>Note: This method may partially sort the input values if not configured to
248     * {@link #withCopy(boolean) copy} the input data.
249     *
250     * @param values Values.
251     * @param from Inclusive start of the range.
252     * @param to Exclusive end of the range.
253     * @return the median
254     * @throws IndexOutOfBoundsException if the sub-range is out of bounds
255     * @since 1.2
256     */
257    public double evaluateRange(int[] values, int from, int to) {
258        Statistics.checkFromToIndex(from, to, values.length);
259        return compute(values, from, to);
260    }
261
262    /**
263     * Compute the median of the specified range.
264     *
265     * @param values Values.
266     * @param from Inclusive start of the range.
267     * @param to Exclusive end of the range.
268     * @return the median
269     */
270    private double compute(int[] values, int from, int to) {
271        final int[] x;
272        final int start;
273        final int end;
274        if (copy) {
275            x = Statistics.copy(values, from, to);
276            start = 0;
277            end = x.length;
278        } else {
279            x = values;
280            start = from;
281            end = to;
282        }
283        final int n = end - start;
284        // Special cases
285        if (n <= 2) {
286            switch (n) {
287            case 2:
288                // Sorting the array matches the behaviour of Quantile for n==2
289                if (x[start + 1] < x[start]) {
290                    final int t = x[start];
291                    x[start] = x[start + 1];
292                    x[start + 1] = t;
293                }
294                return Interpolation.mean(x[start], x[start + 1]);
295            case 1:
296                return x[start];
297            default:
298                return Double.NaN;
299            }
300        }
301        // Median index (including the offset)
302        final int m = (start + end) >>> 1;
303        // Odd
304        if ((n & 0x1) == 1) {
305            Selection.select(x, start, end, m);
306            return x[m];
307        }
308        // Even: require (m-1, m)
309        Selection.select(x, start, end, new int[] {m - 1, m});
310        return Interpolation.mean(x[m - 1], x[m]);
311    }
312
313
314    /**
315     * Evaluate the median.
316     *
317     * <p>If the input length is even the result requires interpolation of two values.
318     * The returned median will interpolate the {@code double} or {@code long} result
319     * on demand. This is more efficient when only one result is required. Consumers
320     * of the result should store the appropriate evaluated value if repeated use is
321     * required.
322     *
323     * <p>Note: This method may partially sort the input values if not configured to
324     * {@link #withCopy(boolean) copy} the input data.
325     *
326     * @param values Values.
327     * @return the median
328     * @since 1.3
329     */
330    public StatisticResult evaluate(long[] values) {
331        return compute(values, 0, values.length);
332    }
333
334    /**
335     * Evaluate the median of the specified range.
336     *
337     * <p>If the input sub-range length is even the result requires interpolation of two values.
338     * The returned median will interpolate the {@code double} or {@code long} result
339     * on demand. This is more efficient when only one result is required. Consumers
340     * of the result should store the appropriate evaluated value if repeated use is
341     * required.
342     *
343     * <p>Note: This method may partially sort the input values if not configured to
344     * {@link #withCopy(boolean) copy} the input data.
345     *
346     * @param values Values.
347     * @param from Inclusive start of the range.
348     * @param to Exclusive end of the range.
349     * @return the median
350     * @throws IndexOutOfBoundsException if the sub-range is out of bounds
351     * @since 1.3
352     */
353    public StatisticResult evaluateRange(long[] values, int from, int to) {
354        Statistics.checkFromToIndex(from, to, values.length);
355        return compute(values, from, to);
356    }
357
358    /**
359     * Compute the median of the specified range.
360     *
361     * @param values Values.
362     * @param from Inclusive start of the range.
363     * @param to Exclusive end of the range.
364     * @return the median
365     */
366    private StatisticResult compute(long[] values, int from, int to) {
367        final long[] x;
368        final int start;
369        final int end;
370        if (copy) {
371            x = Statistics.copy(values, from, to);
372            start = 0;
373            end = x.length;
374        } else {
375            x = values;
376            start = from;
377            end = to;
378        }
379        final int n = end - start;
380        // Special cases
381        if (n <= 2) {
382            switch (n) {
383            case 2:
384                // Sorting the array matches the behaviour of Quantile for n==2
385                if (x[start + 1] < x[start]) {
386                    final long t = x[start];
387                    x[start] = x[start + 1];
388                    x[start + 1] = t;
389                }
390                return Interpolation.mean(x[start], x[start + 1]);
391            case 1:
392                return Statistics.createStatisticResult(x[start]);
393            default:
394                return () -> Double.NaN;
395            }
396        }
397        // Median index (including the offset)
398        final int m = (start + end) >>> 1;
399        // Odd
400        if ((n & 0x1) == 1) {
401            Selection.select(x, start, end, m);
402            return Statistics.createStatisticResult(x[m]);
403        }
404        // Even: require (m-1, m)
405        Selection.select(x, start, end, new int[] {m - 1, m});
406        return Interpolation.mean(x[m - 1], x[m]);
407    }
408}