MongoAggregations.java [src/java/utils] Revision: default Date:
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package utils;
import static com.mongodb.client.model.Accumulators.push;
import static com.mongodb.client.model.Aggregates.group;
import static com.mongodb.client.model.Aggregates.match;
import static com.mongodb.client.model.Aggregates.out;
import static com.mongodb.client.model.Aggregates.project;
import static com.mongodb.client.model.Aggregates.unwind;
import static com.mongodb.client.model.Projections.computed;
import static com.mongodb.client.model.Projections.excludeId;
import static com.mongodb.client.model.Projections.fields;
import static com.mongodb.client.model.Projections.include;
import static java.util.Arrays.asList;
import java.util.Date;
import java.util.List;
import org.bson.Document;
import org.bson.conversions.Bson;
/**
* All MongoAggregations are here,
*
* @author od
*/
class MongoAggregations {
/**
*
* @param to_collection
* @return
*/
static List<Bson> computeMinMax(String to_collection) {
return asList(
project(fields(
excludeId(),
include("values", "metadata", "timestamp"),
computed("metadata", new Document("name", "$metadata.name")
.append("type", "$metadata.type")
.append("norm", "$metadata.norm")
.append("norm_min", "$metadata.norm_min")
.append("norm_max", "$metadata.norm_max")
.append("min", new Document("$min", "$values"))
.append("max", new Document("$max", "$values"))
.append("values_id", "$metadata.values_id")
)
)),
out(to_collection)
);
}
/*
db.getCollection('raw').aggregate([
{ $match : { //match a two datasets
"metadata.type": "in"
}
},{
$project: {
_id:1,
name:"$metadata.name"
}
}])
*/
/**
*
* @param to_collection
* @param in_or_out
* @return
*/
static List<Bson> getVarNames(String to_collection, String in_or_out) {
return asList(
match(new Document("metadata.type", in_or_out)),
project(fields(
excludeId(),
computed("name", "$metadata.name"))
),
out(to_collection)
);
}
/*
normalize:
var a = 2.6
var aname = "erosion"
db.raw.aggregate(
[
{ $match : {name : aname}
},
{ $project : {
name: 1,
values: 1,
type: 1,
normalized: 1,
count: { $size: "$values" },
min: { $min : "$values"},
max: { $max : "$values" },
range: { $subtract: [ { $max : "$values" }, { $min : "$values"}]}
}
},
{ $unwind : "$values"
},
{ $project:
{ name: 1,
count : 1,
min : 1,
max : 1,
type : 1,
// norm: { $multiply: [ "$values", a] },
norm: { $cond: [ "$normalized", "$values",{$divide: [ {$subtract: ["$values", "$min" ]}, "$range"] }] } }
},
{ $group:
{
_id: { name : "$name" ,
count: "$count",
min: "$min",
max: "$max",
type: "$type"
},
values: { $push: "$norm" }
}
},
{ $project:
{
_id : 0,
name : "$_id.name",
count : "$_id.count",
min : "$_id.min",
max : "$_id.max",
type : "$_id.type",
min_index: { $indexOfArray : [ "$values", 0.0 ] },
max_index: { $indexOfArray : [ "$values", 1.0 ] },
date : new Date(),
values: "$values"
}
}
// { $out: "normalized" }
]
)
get some statistics:
db.raw.aggregate( [
{ $project: {
_id: "$name" ,
count: { $size: "$values" },
min: { $min : "$values"},
max: { $max : "$values" },
mean: { $avg : "$values" },
stdev: {$stdDevPop : "$values"},
range: { $subtract: [ { $max : "$values" }, { $min : "$values"}]},
min_index: { $indexOfArray : [ "$values", { $min : "$values"} ] },
max_index: { $indexOfArray : [ "$values", { $max : "$values"} ] }
}
}
])
*/
/**
* TODO: replace min and max with metadata.min and metadata.max No need of
* assigning them
* @param to_collection
* @return
*/
static List<Bson> normalize(String to_collection) {
return asList(
project(fields(
excludeId(),
include("values", "metadata.name", "metadata.type",
"metadata.norm", "metadata.norm_min", "metadata.norm_max",
"metadata.min", "metadata.max", "metadata.values_id"),
computed("count", new Document("$size", "$values")),
computed("min", "$metadata.min"),
computed("max", "$metadata.max"),
computed("range", new Document("$subtract",
asList("$metadata.max", "$metadata.min"))))),
unwind("$values"),
project(fields(
excludeId(),
include("values", "min", "max", "count", "metadata.name",
"metadata.type", "metadata.norm", "metadata.norm_min",
"metadata.norm_max", "metadata.values_id"),
computed("normVals", new Document(
"$cond", asList("$metadata.norm", "$values", new Document(
"$sum", asList(new Document(
"$divide", asList(new Document(
"$multiply", asList(new Document(
"$subtract", asList("$values", "$min")),
new Document("$subtract", asList(
"$metadata.norm_max", "$metadata.norm_min"))
)
), "$range")
), "$metadata.norm_min")
))
)
))),
group(new Document("name", "$metadata.name")
.append("type", "$metadata.type")
.append("norm", "$metadata.norm")
.append("norm_min", "$metadata.norm_min")
.append("norm_max", "$metadata.norm_max")
.append("values_id", "$metadata.values_id")
.append("count", "$count")
.append("min", "$min")
.append("max", "$max"), push("values", "$normVals")),
project(fields(
excludeId(),
computed("timestamp", new Date()),
computed("metadata", new Document("name", "$_id.name")
.append("type", "$_id.type")
.append("norm", "$_id.norm")
.append("norm_min", "$_id.norm_min")
.append("norm_max", "$_id.norm_max")
.append("values_id", "$_id.values_id")
.append("count", "$_id.count")
.append("min", "$_id.min")
.append("max", "$_id.max")
.append("min_index", new Document("$indexOfArray", asList("$values", "$_id.norm_min")))
.append("max_index", new Document("$indexOfArray", asList("$values", "$_id.norm_max")))),
computed("values", "$values"))
),
out(to_collection)
);
}
/*
db.getCollection('raw').aggregate([
// { $match: { //match a specific dataset
// _id : ObjectId("5e3104a73a4ae21d69da4420")
// }
// },
{ $project: { // project only the values
"values" : 1,
"_id" : 1,
"name" : "$metadata.name"
}
},{ $unwind: '$values' // unwind to be able to sort
},{ $sort : { // sort the values
values : 1
}
},{ $group: { // recreate the array
_id: "$_id",
vals : { $push: "$values" },
name : { $first: "$name" }
}
},{ $project: { // find missing values (-9999) by filtering
vals : 1,
"name":1,
v: {
$filter: {
input: "$vals",
cond: { $eq: [ "$$this", -9999] }
}
}
}
},{ $project: { // count the missing value, and all values
vals : 1,
"name":1,
count_missing: { $size: "$v" },
count: { $size: "$vals" }
}
},{ $project: { // compute percentile indices
count : 1,
vals : 1,
count_missing : 1,
"name":1,
p25Index : { $toInt : { $divide: [ { $multiply: [ "$count", 25 ]}, 100] } },
p50Index : { $toInt : { $divide: [ { $multiply: [ "$count", 50 ]}, 100] } },
p75Index : { $toInt : { $divide: [ { $multiply: [ "$count", 75 ]}, 100] } },
}
},{ $set: { // compute the percentiles
min : { $min: "$vals" },
p25 : { $arrayElemAt: [ "$vals", "$p25Index" ] },
p50 : { $arrayElemAt: [ "$vals", "$p50Index" ] },
p75 : { $arrayElemAt: [ "$vals", "$p75Index" ] },
max : { $max: "$vals" }
}
},{ $set: { // compute
irq: { $subtract: ["$p75", "$p25"] },
mean: { $avg: "$vals" },
stdDev: { $stdDevPop: "$vals" }
}
},{ $set: {
of_min: { $subtract: [ "$p25" , { $multiply: [ "$irq", 3.0] }] },
if_min: { $subtract: [ "$p25" , { $multiply: [ "$irq", 1.5] }] },
if_max: { $add: [ "$p75" , { $multiply: [ "$irq", 1.5] }] },
of_max: { $add: [ "$p75" , { $multiply: [ "$irq", 3.0] }] }
}
},{ $project: { // outliers by filtering
name:1,
count : 1,
vals : 1,
if_min : 1,
if_max : 1,
of_min : 1,
of_max : 1,
min: 1,
p25:1,
p50:1,
p75:1,
max: 1,
irq:1,
mean:1,
stdDev:1,
count_missing:1,
minor_outliers: {
$filter: {
input: "$vals",
cond: { $or: [
{ $and: [ { $lt: [ "$$this", "$if_min"] }, { $gt: [ "$$this", "$of_min"] } ] } ,
{ $and: [ { $lt: [ "$$this", "$of_max"] }, { $gt: [ "$$this", "$if_max"] } ] }
]}
}
},
major_outliers: {
$filter: {
input: "$vals",
cond: { $or: [ { $lt: [ "$$this", "$of_min"] }, { $gt: [ "$$this", "$of_max"] } ] }
}
}
}
},{
$project: { // remove not needed fields
p25Index : 0,
p50Index : 0,
p75Index : 0,
// if_min : 0,
// if_max : 0,
// of_min : 0,
// of_max : 0,
vals: 0,
}
}], { allowDiskUse: true } )
*/
static List<Bson> raw_stats(String to_collection) {
// TODO create Java Aggregate
return null;
}
/*
db.getCollection('raw').aggregate([
{ $match : { //match a two datasets
$or: [ {"metadata.name": "pd"}, {"metadata.name": "da"} ]
}
},
{ $project: {
values : 1,
_id : 1,
mean: { $avg: "$values"},
sd: { $stdDevPop: "$values" },
n: { $size: "$values"},
name : "$metadata.name"
}
},{
$group: {
_id: 0,
n: {$last: "$n"},
mean_x: {$last: "$mean"},
mean_y: {$first: "$mean"},
sd_x: { $last: "$sd" },
sd_y: { $first: "$sd" },
values_x: {$last: "$values"},
values_y: {$first: "$values"},
name_x: {$last: "$name"},
name_y: {$first: "$name"},
}
},{
$project: {
values_x:1,
values_y:1,
mean_x:1,
mean_y:1,
sd_x:1,
sd_y:1,
name_x:1,
name_y:1,
n:1,
dp: {
$reduce: {
input: { $range: [ 0, "$n"] },
initialValue: 0,
in: { $add: [ "$$value", { $multiply: [ { $arrayElemAt: [ "$values_x", "$$this" ] }, { $arrayElemAt: [ "$values_y", "$$this" ] } ] } ] }
}
},
// x_min_mean: {
// $map: { input: "$values_x", as: "x", in: { $subtract: [ "$$x", "$mean_x"] }}
// },
// y_min_mean: {
// $map: { input: "$values_y", as: "y", in: { $subtract: [ "$$y", "$mean_y"] }}
// }
//
}
},{
$set: {
na: { $concat: [ "$name_x", "-", "$name_y"] },
r: { $divide: [
{ $subtract: [ "$dp", { $multiply: ["$n", "$mean_x", "$mean_y"] }]},
{ $multiply: [ "$sd_x", "$sd_y", { $subtract: [ "$n" , 1 ]}] }
]
}
}
},{
$group: {
_id:0,
pearson_correlation: { $addToSet: { vars: "$na", r: "$r"} }
}
}], { allowDiskUse: true })
*/
static List<Bson> pearson(String to_collection, String var1, String var2) {
// TODO create Java Aggregate
return null;
}
}