Statistics.java [src/csip/cosu] Revision: Date:
/*
* $Id$
*
* This file is part of the Cloud Services Integration Platform (CSIP),
* a Model-as-a-Service framework, API and application suite.
*
* 2012-2022, Olaf David and others, OMSLab, Colorado State University.
*
* OMSLab licenses this file to you under the MIT license.
* See the LICENSE file in the project root for more information.
*/
package csip.cosu;
import java.util.Arrays;
/**
* Statistics basics.
*
* @author od
*/
public class Statistics {
public static final int MAXIMIZATION = 1;
public static final int MINIMIZATION = 2;
public static final int ABSMAXIMIZATION = 3;
public static final int ABSMINIMIZATION = 4;
private Statistics() {
}
/**
* Normalized Vector.
*/
public static double norm_vec(double x, double y, double z) {
return Math.sqrt(x * x + y * y + z * z);
}
public static double max(double[] vals) {
double max = vals[0];
for (double v : vals) {
if (v > max) {
max = v;
}
}
return max;
}
public static double min(double[] vals) {
double min = vals[0];
for (double v : vals) {
if (v < min) {
min = v;
}
}
return min;
}
public static double range(double[] vals) {
double min = vals[0];
double max = vals[0];
for (double v : vals) {
if (v < min) {
min = v;
}
if (v > max) {
max = v;
}
}
return max - min;
}
public static int length(double[] vals) {
return vals.length;
}
public static double median(double[] vals) {
return quantile(vals, 0.5);
}
public static double mean(double[] vals) {
return sum(vals) / vals.length;
}
public static double stddev(double[] vals) {
double mean = mean(vals);
double squareSum = 0;
for (double v : vals) {
squareSum += v * v;
}
return Math.sqrt(squareSum / vals.length - mean * mean);
}
public static double stderr(double[] vals) {
// return Math.sqrt(variance(vals) / vals.length);
return stddev(vals) / Math.sqrt(vals.length);
}
public static double variance(double[] vals) {
double stddev = stddev(vals);
return stddev * stddev;
}
public static double meandev(double[] vals) {
double mean = mean(vals);
int size = vals.length;
double sum = 0;
for (int i = size; --i >= 0;) {
sum += Math.abs(vals[i] - mean);
}
return sum / size;
}
public static double sum(double[] vals) {
double sum = 0;
for (double v : vals) {
sum = sum + v;
}
return sum;
}
public static double product(double[] vals) {
double prod = 1;
for (double v : vals) {
prod = prod * v;
}
return prod;
}
public static double quantile(double[] vals, double phi) {
if (vals.length == 0) {
return 0.0;
}
double[] sortedElements = Arrays.copyOf(vals, vals.length);
Arrays.sort(sortedElements);
int n = sortedElements.length;
double index = phi * (n - 1);
int lhs = (int) index;
double delta = index - lhs;
double result;
if (lhs == n - 1) {
result = sortedElements[lhs];
} else {
result = (1 - delta) * sortedElements[lhs] + delta * sortedElements[lhs + 1];
}
return result;
}
/**
* Returns the lag-1 autocorrelation of a dataset;
*/
public static double lag1(double[] vals) {
double mean = mean(vals);
int size = vals.length;
double r1;
double q = 0;
double v = (vals[0] - mean) * (vals[0] - mean);
for (int i = 1; i < size; i++) {
double delta0 = (vals[i - 1] - mean);
double delta1 = (vals[i] - mean);
q += (delta0 * delta1 - q) / (i + 1);
v += (delta1 * delta1 - v) / (i + 1);
}
r1 = q / v;
return r1;
}
public static double norm_rmse(double[] obs, double[] sim, double missing) {
double sum = 0;
int size = 0;
for (int i = 0; i < obs.length; i++) {
if (obs[i] > missing) {
sum += obs[i];
size++;
}
}
if (size == 0)
throw new RuntimeException("No valid size value.");
double measuredMean = sum / size;
int N = Math.min(obs.length, sim.length);
double numerator = 0, denominator = 0;
for (int i = 0; i < N; i++) {
if (obs[i] > missing) {
numerator += (obs[i] - sim[i]) * (obs[i] - sim[i]);
denominator += (obs[i] - measuredMean) * (obs[i] - measuredMean);
}
}
if (denominator == 0) {
throw new RuntimeException("Error: The denominator is 0.\n"
+ "This happens if all observed values are equal to their mean.");
}
return Math.sqrt(numerator / denominator);
}
public static double err_sum(double[] obs, double[] sim) {
double volError = 0;
for (int i = 0; i < sim.length; i++) {
volError += (sim[i] - obs[i]);
}
return volError;
}
/**
* Calcs coefficients of linear regression between x, y data
*
* @param xData the independent data array (x)
* @param yData the dependent data array (y)
* @return (intercept, gradient, r2)
*/
public static double[] linearReg(double[] xData, double[] yData) {
sameArrayLen(xData, yData);
double sumX = 0;
double sumY = 0;
double prod = 0;
int nstat = xData.length;
double[] regCoef = new double[3]; //(intercept, gradient, r2)
double meanYValue = mean(yData);
double meanXValue = mean(xData);
//calculating regression coefficients
for (int i = 0; i < nstat; i++) {
sumX += (xData[i] - meanXValue) * (xData[i] - meanXValue);
sumY += (yData[i] - meanYValue) * (yData[i] - meanYValue);
prod += (xData[i] - meanXValue) * (yData[i] - meanYValue);
}
if (sumX > 0 && sumY > 0) {
regCoef[1] = prod / sumX; //gradient
regCoef[0] = meanYValue - regCoef[1] * meanXValue; //intercept
regCoef[2] = Math.pow((prod / Math.sqrt(sumX * sumY)), 2); //r2
}
return regCoef;
}
public static double intercept(double[] xData, double[] yData) {
return linearReg(xData, yData)[0];
}
public static double gradient(double[] xData, double[] yData) {
return linearReg(xData, yData)[1];
}
public static double r2(double[] xData, double[] yData) {
return linearReg(xData, yData)[2];
}
/**
* Generate a random number in a range.
*
* @param min
* @param max
* @return the random value in the min/max range
*/
public static double random(double min, double max) {
assert max > min;
return min + Math.random() * (max - min);
}
/**
* Runoff coefficient error ROCE
*
* @param obs
* @param sim
* @param precip
* @return roce
*/
public static double runoffCoefficientError(double[] obs, double[] sim, double[] precip) {
sameArrayLen(sim, obs, precip);
double mean_pred = mean(sim);
double mean_val = mean(obs);
double mean_ppt = mean(precip);
if (Double.compare(mean_ppt, 0.0) == 0)
throw new RuntimeException("No valid mean_ppt value.");
double error = Math.abs((mean_pred / mean_ppt) - (mean_val / mean_ppt));
return Math.sqrt(error);
}
public static double stderrReg(double[] regcoeff, double[] obs, double[] sim) {
sameArrayLen(obs, sim);
double sum = 0;
for (int i = 0; i < obs.length; i++) {
double res = sim[i] * regcoeff[1] + regcoeff[0];
sum += (obs[i] - res) * (obs[i] - res);
}
return Math.sqrt(sum / (obs.length - 2));
}
/**
*
* @param obs observed data
* @param sim simulated data
* @return dsGrad
*/
public static double dsGrad(double[] obs, double[] sim) {
sameArrayLen(obs, sim);
int dsLength = sim.length;
double[] cumPred = new double[dsLength];
double[] cumVali = new double[dsLength];
double cp = 0;
double cv = 0;
for (int i = 0; i < dsLength; i++) {
cp += sim[i];
cv += obs[i];
cumPred[i] = cp;
cumVali[i] = cv;
}
//interc., grad., r?
double[] regCoef = linearReg(cumVali, cumPred);
return regCoef[1];
}
/**
* Check if the arrays have the same length
*
* @param arr
*/
private static void sameArrayLen(double[]... arr) {
int len = arr[0].length;
for (double[] a : arr) {
if (a.length != len) {
throw new IllegalArgumentException("obs and sim data have not same size (" + a.length + "/" + len + ")");
}
}
}
/*
*
*
*
* simulated_array: one dimensional ndarray An array of simulated data from
* the time series. observed_array: one dimensional ndarray An array of
* observed data from the time series. replace_nan: float, optional If given,
* indicates which value to replace NaN values with in the two arrays. If
* None, when a NaN value is found at the i-th position in the observed OR
* simulated array, the i-th value of the observed and simulated array are
* removed before the computation. replace_inf: float, optional If given,
* indicates which value to replace Inf values with in the two arrays. If
* None, when an inf value is found at the i-th position in the observed OR
* simulated array, the i-th value of the observed and simulated array are
* removed before the computation. remove_neg: boolean, optional If True, when
* a negative value is found at the i-th position in the observed OR simulated
* array, the i-th value of the observed AND simulated array are removed
* before the computation. remove_zero: boolean, optional If true, when a zero
* value is found at the i-th position in the observed OR simulated array, the
* i-th value of the observed AND simulated array are removed before the
* computation.
*
*
*
* def treat_values(simulated_array, observed_array, replace_nan=None,
* replace_inf=None, remove_neg=False, remove_zero=False): """Removes the nan,
* negative, and inf values in two numpy arrays""" sim_copy =
* np.copy(simulated_array) obs_copy = np.copy(observed_array)
*
* # Checking to see if the vectors are the same length assert sim_copy.ndim
* == 1, "The simulated array is not one dimensional." assert obs_copy.ndim ==
* 1, "The observed array is not one dimensional."
*
* if sim_copy.size != obs_copy.size: raise RuntimeError("The two ndarrays are
* not the same size.")
*
* # Treat missing data in observed_array and simulated_array, rows in
* simulated_array or # observed_array that contain nan values
* all_treatment_array = np.ones(obs_copy.size, dtype=bool)
*
* if np.any(np.isnan(obs_copy)) or np.any(np.isnan(sim_copy)): if replace_nan
* is not None: # Finding the NaNs sim_nan = np.isnan(sim_copy) obs_nan =
* np.isnan(obs_copy) # Replacing the NaNs with the input sim_copy[sim_nan] =
* replace_nan obs_copy[obs_nan] = replace_nan
*
* warnings.warn("Elements(s) {} contained NaN values in the simulated array
* and " "elements(s) {} contained NaN values in the observed array and have
* been " "replaced (Elements are zero indexed).".format(np.where(sim_nan)[0],
* np.where(obs_nan)[0]), UserWarning) else: # Getting the indices of the nan
* values, combining them, and informing user. nan_indices_fcst =
* ~np.isnan(sim_copy) nan_indices_obs = ~np.isnan(obs_copy) all_nan_indices =
* np.logical_and(nan_indices_fcst, nan_indices_obs) all_treatment_array =
* np.logical_and(all_treatment_array, all_nan_indices)
*
* warnings.warn("Row(s) {} contained NaN values and the row(s) have been "
* "removed (Rows are zero indexed).".format(np.where(~all_nan_indices)[0]),
* UserWarning)
*
* if np.any(np.isinf(obs_copy)) or np.any(np.isinf(sim_copy)): if replace_nan
* is not None: # Finding the NaNs sim_inf = np.isinf(sim_copy) obs_inf =
* np.isinf(obs_copy) # Replacing the NaNs with the input sim_copy[sim_inf] =
* replace_inf obs_copy[obs_inf] = replace_inf
*
* warnings.warn("Elements(s) {} contained Inf values in the simulated array
* and " "elements(s) {} contained Inf values in the observed array and have
* been " "replaced (Elements are zero indexed).".format(np.where(sim_inf)[0],
* np.where(obs_inf)[0]), UserWarning) else: inf_indices_fcst =
* ~(np.isinf(sim_copy)) inf_indices_obs = ~np.isinf(obs_copy) all_inf_indices
* = np.logical_and(inf_indices_fcst, inf_indices_obs) all_treatment_array =
* np.logical_and(all_treatment_array, all_inf_indices)
*
* warnings.warn( "Row(s) {} contained Inf or -Inf values and the row(s) have
* been removed (Rows " "are zero
* indexed).".format(np.where(~all_inf_indices)[0]), UserWarning )
*
* # Treat zero data in observed_array and simulated_array, rows in
* simulated_array or # observed_array that contain zero values if
* remove_zero: if (obs_copy == 0).any() or (sim_copy == 0).any():
* zero_indices_fcst = ~(sim_copy == 0) zero_indices_obs = ~(obs_copy == 0)
* all_zero_indices = np.logical_and(zero_indices_fcst, zero_indices_obs)
* all_treatment_array = np.logical_and(all_treatment_array, all_zero_indices)
*
* warnings.warn( "Row(s) {} contained zero values and the row(s) have been
* removed (Rows are " "zero
* indexed).".format(np.where(~all_zero_indices)[0]), UserWarning )
*
* # Treat negative data in observed_array and simulated_array, rows in
* simulated_array or # observed_array that contain negative values
*
* # Ignore runtime warnings from comparing if remove_neg: with
* np.errstate(invalid='ignore'): obs_copy_bool = obs_copy < 0 sim_copy_bool =
* sim_copy < 0
*
* if obs_copy_bool.any() or sim_copy_bool.any(): neg_indices_fcst =
* ~sim_copy_bool neg_indices_obs = ~obs_copy_bool all_neg_indices =
* np.logical_and(neg_indices_fcst, neg_indices_obs) all_treatment_array =
* np.logical_and(all_treatment_array, all_neg_indices)
*
* warnings.warn("Row(s) {} contained negative values and the row(s) have been
* " "removed (Rows are zero indexed).".format(np.where(~all_neg_indices)[0]),
* UserWarning)
*
* obs_copy = obs_copy[all_treatment_array] sim_copy =
* sim_copy[all_treatment_array]
*
* return sim_copy, obs_copy
*/
}