Statistics.java [src/csip/cosu] Revision:   Date:
/*
 * $Id$
 *
 * This file is part of the Cloud Services Integration Platform (CSIP),
 * a Model-as-a-Service framework, API and application suite.
 *
 * 2012-2022, Olaf David and others, OMSLab, Colorado State University.
 *
 * OMSLab licenses this file to you under the MIT license.
 * See the LICENSE file in the project root for more information.
 */
package csip.cosu;

import java.util.Arrays;

/**
 * Statistics basics.
 *
 * @author od
 */
public class Statistics {

  public static final int MAXIMIZATION = 1;
  public static final int MINIMIZATION = 2;
  public static final int ABSMAXIMIZATION = 3;
  public static final int ABSMINIMIZATION = 4;


  private Statistics() {
  }


  /**
   * Normalized Vector.
   */
  public static double norm_vec(double x, double y, double z) {
    return Math.sqrt(x * x + y * y + z * z);
  }


  public static double max(double[] vals) {
    double max = vals[0];
    for (double v : vals) {
      if (v > max) {
        max = v;
      }
    }
    return max;
  }


  public static double min(double[] vals) {
    double min = vals[0];
    for (double v : vals) {
      if (v < min) {
        min = v;
      }
    }
    return min;
  }


  public static double range(double[] vals) {
    double min = vals[0];
    double max = vals[0];
    for (double v : vals) {
      if (v < min) {
        min = v;
      }
      if (v > max) {
        max = v;
      }
    }
    return max - min;
  }


  public static int length(double[] vals) {
    return vals.length;
  }


  public static double median(double[] vals) {
    return quantile(vals, 0.5);
  }


  public static double mean(double[] vals) {
    return sum(vals) / vals.length;
  }


  public static double stddev(double[] vals) {
    double mean = mean(vals);
    double squareSum = 0;
    for (double v : vals) {
      squareSum += v * v;
    }
    return Math.sqrt(squareSum / vals.length - mean * mean);
  }


  public static double stderr(double[] vals) {
//        return Math.sqrt(variance(vals) / vals.length);
    return stddev(vals) / Math.sqrt(vals.length);
  }


  public static double variance(double[] vals) {
    double stddev = stddev(vals);
    return stddev * stddev;
  }


  public static double meandev(double[] vals) {
    double mean = mean(vals);
    int size = vals.length;
    double sum = 0;
    for (int i = size; --i >= 0;) {
      sum += Math.abs(vals[i] - mean);
    }
    return sum / size;
  }


  public static double sum(double[] vals) {
    double sum = 0;
    for (double v : vals) {
      sum = sum + v;
    }
    return sum;
  }


  public static double product(double[] vals) {
    double prod = 1;
    for (double v : vals) {
      prod = prod * v;
    }
    return prod;
  }


  public static double quantile(double[] vals, double phi) {
    if (vals.length == 0) {
      return 0.0;
    }

    double[] sortedElements = Arrays.copyOf(vals, vals.length);
    Arrays.sort(sortedElements);
    int n = sortedElements.length;

    double index = phi * (n - 1);
    int lhs = (int) index;
    double delta = index - lhs;
    double result;

    if (lhs == n - 1) {
      result = sortedElements[lhs];
    } else {
      result = (1 - delta) * sortedElements[lhs] + delta * sortedElements[lhs + 1];
    }
    return result;
  }


  /**
   * Returns the lag-1 autocorrelation of a dataset;
   */
  public static double lag1(double[] vals) {
    double mean = mean(vals);
    int size = vals.length;
    double r1;
    double q = 0;
    double v = (vals[0] - mean) * (vals[0] - mean);
    for (int i = 1; i < size; i++) {
      double delta0 = (vals[i - 1] - mean);
      double delta1 = (vals[i] - mean);
      q += (delta0 * delta1 - q) / (i + 1);
      v += (delta1 * delta1 - v) / (i + 1);
    }
    r1 = q / v;
    return r1;
  }


  public static double norm_rmse(double[] obs, double[] sim, double missing) {
    double sum = 0;
    int size = 0;
    for (int i = 0; i < obs.length; i++) {
      if (obs[i] > missing) {
        sum += obs[i];
        size++;
      }
    }

    if (size == 0)
      throw new RuntimeException("No valid size value.");

    double measuredMean = sum / size;
    int N = Math.min(obs.length, sim.length);
    double numerator = 0, denominator = 0;
    for (int i = 0; i < N; i++) {
      if (obs[i] > missing) {
        numerator += (obs[i] - sim[i]) * (obs[i] - sim[i]);
        denominator += (obs[i] - measuredMean) * (obs[i] - measuredMean);
      }
    }
    if (denominator == 0) {
      throw new RuntimeException("Error: The denominator is 0.\n"
          + "This happens if all observed values are equal to their mean.");
    }
    return Math.sqrt(numerator / denominator);
  }


  public static double err_sum(double[] obs, double[] sim) {
    double volError = 0;
    for (int i = 0; i < sim.length; i++) {
      volError += (sim[i] - obs[i]);
    }
    return volError;
  }


  /**
   * Calcs coefficients of linear regression between x, y data
   *
   * @param xData the independent data array (x)
   * @param yData the dependent data array (y)
   * @return (intercept, gradient, r2)
   */
  public static double[] linearReg(double[] xData, double[] yData) {
    sameArrayLen(xData, yData);
    double sumX = 0;
    double sumY = 0;
    double prod = 0;
    int nstat = xData.length;
    double[] regCoef = new double[3]; //(intercept, gradient, r2)

    double meanYValue = mean(yData);
    double meanXValue = mean(xData);

    //calculating regression coefficients
    for (int i = 0; i < nstat; i++) {
      sumX += (xData[i] - meanXValue) * (xData[i] - meanXValue);
      sumY += (yData[i] - meanYValue) * (yData[i] - meanYValue);
      prod += (xData[i] - meanXValue) * (yData[i] - meanYValue);
    }
    if (sumX > 0 && sumY > 0) {
      regCoef[1] = prod / sumX;  //gradient
      regCoef[0] = meanYValue - regCoef[1] * meanXValue; //intercept
      regCoef[2] = Math.pow((prod / Math.sqrt(sumX * sumY)), 2); //r2
    }
    return regCoef;
  }


  public static double intercept(double[] xData, double[] yData) {
    return linearReg(xData, yData)[0];
  }


  public static double gradient(double[] xData, double[] yData) {
    return linearReg(xData, yData)[1];
  }


  public static double r2(double[] xData, double[] yData) {
    return linearReg(xData, yData)[2];
  }


  /**
   * Generate a random number in a range.
   *
   * @param min
   * @param max
   * @return the random value in the min/max range
   */
  public static double random(double min, double max) {
    assert max > min;
    return min + Math.random() * (max - min);
  }


  /**
   * Runoff coefficient error ROCE
   *
   * @param obs
   * @param sim
   * @param precip
   * @return roce
   */
  public static double runoffCoefficientError(double[] obs, double[] sim, double[] precip) {
    sameArrayLen(sim, obs, precip);
    double mean_pred = mean(sim);
    double mean_val = mean(obs);
    double mean_ppt = mean(precip);

    if (Double.compare(mean_ppt, 0.0) == 0)
      throw new RuntimeException("No valid mean_ppt value.");

    double error = Math.abs((mean_pred / mean_ppt) - (mean_val / mean_ppt));
    return Math.sqrt(error);
  }


  public static double stderrReg(double[] regcoeff, double[] obs, double[] sim) {
    sameArrayLen(obs, sim);
    double sum = 0;
    for (int i = 0; i < obs.length; i++) {
      double res = sim[i] * regcoeff[1] + regcoeff[0];
      sum += (obs[i] - res) * (obs[i] - res);
    }
    return Math.sqrt(sum / (obs.length - 2));
  }


  /**
   *
   * @param obs observed data
   * @param sim simulated data
   * @return dsGrad
   */
  public static double dsGrad(double[] obs, double[] sim) {
    sameArrayLen(obs, sim);
    int dsLength = sim.length;

    double[] cumPred = new double[dsLength];
    double[] cumVali = new double[dsLength];

    double cp = 0;
    double cv = 0;
    for (int i = 0; i < dsLength; i++) {
      cp += sim[i];
      cv += obs[i];
      cumPred[i] = cp;
      cumVali[i] = cv;
    }

    //interc., grad., r?
    double[] regCoef = linearReg(cumVali, cumPred);
    return regCoef[1];
  }


  /**
   * Check if the arrays have the same length
   *
   * @param arr
   */
  private static void sameArrayLen(double[]... arr) {
    int len = arr[0].length;
    for (double[] a : arr) {
      if (a.length != len) {
        throw new IllegalArgumentException("obs and sim data have not same size (" + a.length + "/" + len + ")");
      }
    }
  }

  /*
   *
   *
   *
   * simulated_array: one dimensional ndarray An array of simulated data from
   * the time series. observed_array: one dimensional ndarray An array of
   * observed data from the time series. replace_nan: float, optional If given,
   * indicates which value to replace NaN values with in the two arrays. If
   * None, when a NaN value is found at the i-th position in the observed OR
   * simulated array, the i-th value of the observed and simulated array are
   * removed before the computation. replace_inf: float, optional If given,
   * indicates which value to replace Inf values with in the two arrays. If
   * None, when an inf value is found at the i-th position in the observed OR
   * simulated array, the i-th value of the observed and simulated array are
   * removed before the computation. remove_neg: boolean, optional If True, when
   * a negative value is found at the i-th position in the observed OR simulated
   * array, the i-th value of the observed AND simulated array are removed
   * before the computation. remove_zero: boolean, optional If true, when a zero
   * value is found at the i-th position in the observed OR simulated array, the
   * i-th value of the observed AND simulated array are removed before the
   * computation.
   *
   *
   *
   * def treat_values(simulated_array, observed_array, replace_nan=None,
   * replace_inf=None, remove_neg=False, remove_zero=False): """Removes the nan,
   * negative, and inf values in two numpy arrays""" sim_copy =
   * np.copy(simulated_array) obs_copy = np.copy(observed_array)
   *
   * # Checking to see if the vectors are the same length assert sim_copy.ndim
   * == 1, "The simulated array is not one dimensional." assert obs_copy.ndim ==
   * 1, "The observed array is not one dimensional."
   *
   * if sim_copy.size != obs_copy.size: raise RuntimeError("The two ndarrays are
   * not the same size.")
   *
   * # Treat missing data in observed_array and simulated_array, rows in
   * simulated_array or # observed_array that contain nan values
   * all_treatment_array = np.ones(obs_copy.size, dtype=bool)
   *
   * if np.any(np.isnan(obs_copy)) or np.any(np.isnan(sim_copy)): if replace_nan
   * is not None: # Finding the NaNs sim_nan = np.isnan(sim_copy) obs_nan =
   * np.isnan(obs_copy) # Replacing the NaNs with the input sim_copy[sim_nan] =
   * replace_nan obs_copy[obs_nan] = replace_nan
   *
   * warnings.warn("Elements(s) {} contained NaN values in the simulated array
   * and " "elements(s) {} contained NaN values in the observed array and have
   * been " "replaced (Elements are zero indexed).".format(np.where(sim_nan)[0],
   * np.where(obs_nan)[0]), UserWarning) else: # Getting the indices of the nan
   * values, combining them, and informing user. nan_indices_fcst =
   * ~np.isnan(sim_copy) nan_indices_obs = ~np.isnan(obs_copy) all_nan_indices =
   * np.logical_and(nan_indices_fcst, nan_indices_obs) all_treatment_array =
   * np.logical_and(all_treatment_array, all_nan_indices)
   *
   * warnings.warn("Row(s) {} contained NaN values and the row(s) have been "
   * "removed (Rows are zero indexed).".format(np.where(~all_nan_indices)[0]),
   * UserWarning)
   *
   * if np.any(np.isinf(obs_copy)) or np.any(np.isinf(sim_copy)): if replace_nan
   * is not None: # Finding the NaNs sim_inf = np.isinf(sim_copy) obs_inf =
   * np.isinf(obs_copy) # Replacing the NaNs with the input sim_copy[sim_inf] =
   * replace_inf obs_copy[obs_inf] = replace_inf
   *
   * warnings.warn("Elements(s) {} contained Inf values in the simulated array
   * and " "elements(s) {} contained Inf values in the observed array and have
   * been " "replaced (Elements are zero indexed).".format(np.where(sim_inf)[0],
   * np.where(obs_inf)[0]), UserWarning) else: inf_indices_fcst =
   * ~(np.isinf(sim_copy)) inf_indices_obs = ~np.isinf(obs_copy) all_inf_indices
   * = np.logical_and(inf_indices_fcst, inf_indices_obs) all_treatment_array =
   * np.logical_and(all_treatment_array, all_inf_indices)
   *
   * warnings.warn( "Row(s) {} contained Inf or -Inf values and the row(s) have
   * been removed (Rows " "are zero
   * indexed).".format(np.where(~all_inf_indices)[0]), UserWarning )
   *
   * # Treat zero data in observed_array and simulated_array, rows in
   * simulated_array or # observed_array that contain zero values if
   * remove_zero: if (obs_copy == 0).any() or (sim_copy == 0).any():
   * zero_indices_fcst = ~(sim_copy == 0) zero_indices_obs = ~(obs_copy == 0)
   * all_zero_indices = np.logical_and(zero_indices_fcst, zero_indices_obs)
   * all_treatment_array = np.logical_and(all_treatment_array, all_zero_indices)
   *
   * warnings.warn( "Row(s) {} contained zero values and the row(s) have been
   * removed (Rows are " "zero
   * indexed).".format(np.where(~all_zero_indices)[0]), UserWarning )
   *
   * # Treat negative data in observed_array and simulated_array, rows in
   * simulated_array or # observed_array that contain negative values
   *
   * # Ignore runtime warnings from comparing if remove_neg: with
   * np.errstate(invalid='ignore'): obs_copy_bool = obs_copy < 0 sim_copy_bool =
   * sim_copy < 0
   *
   * if obs_copy_bool.any() or sim_copy_bool.any(): neg_indices_fcst =
   * ~sim_copy_bool neg_indices_obs = ~obs_copy_bool all_neg_indices =
   * np.logical_and(neg_indices_fcst, neg_indices_obs) all_treatment_array =
   * np.logical_and(all_treatment_array, all_neg_indices)
   *
   * warnings.warn("Row(s) {} contained negative values and the row(s) have been
   * " "removed (Rows are zero indexed).".format(np.where(~all_neg_indices)[0]),
   * UserWarning)
   *
   * obs_copy = obs_copy[all_treatment_array] sim_copy =
   * sim_copy[all_treatment_array]
   *
   * return sim_copy, obs_copy
   */
}