MongoAggregations.java [src/java/utils] Revision: default  Date:
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package utils;

import static com.mongodb.client.model.Accumulators.push;
import static com.mongodb.client.model.Aggregates.group;
import static com.mongodb.client.model.Aggregates.match;
import static com.mongodb.client.model.Aggregates.out;
import static com.mongodb.client.model.Aggregates.project;
import static com.mongodb.client.model.Aggregates.unwind;
import static com.mongodb.client.model.Projections.computed;
import static com.mongodb.client.model.Projections.excludeId;
import static com.mongodb.client.model.Projections.fields;
import static com.mongodb.client.model.Projections.include;
import static java.util.Arrays.asList;
import java.util.Date;
import java.util.List;
import org.bson.Document;
import org.bson.conversions.Bson;

/**
 * All MongoAggregations are here,
 *
 * @author od
 */
class MongoAggregations {

  /**
   *
   * @param to_collection
   * @return
   */
  static List<Bson> computeMinMax(String to_collection) {
    return asList(
        project(fields(
            excludeId(),
            include("values", "metadata", "timestamp"),
            computed("metadata", new Document("name", "$metadata.name")
                .append("type", "$metadata.type")
                .append("norm", "$metadata.norm")
                .append("norm_min", "$metadata.norm_min")
                .append("norm_max", "$metadata.norm_max")
                .append("min", new Document("$min", "$values"))
                .append("max", new Document("$max", "$values"))
                .append("values_id", "$metadata.values_id")
            )
        )),
        out(to_collection)
    );
  }


  /*
  db.getCollection('raw').aggregate([
 {   $match : { //match a two datasets
       "metadata.type": "in" 
     }
 },{
     $project: {
         _id:1,
         name:"$metadata.name"
     }
 }])
   */
  /**
   *
   * @param to_collection
   * @param in_or_out
   * @return
   */
  static List<Bson> getVarNames(String to_collection, String in_or_out) {
    return asList(
        match(new Document("metadata.type", in_or_out)),
        project(fields(
            excludeId(),
            computed("name", "$metadata.name"))
        ),
        out(to_collection)
    );
  }


  /*
normalize:
var a = 2.6
var aname = "erosion"

db.raw.aggregate(
 [
      { $match : {name : aname}
      },
      { $project : {
            name: 1,
            values: 1,
            type: 1,
            normalized: 1,
        count: { $size: "$values" }, 
        min: { $min : "$values"}, 
        max: { $max : "$values" },
        range: { $subtract: [ { $max : "$values" }, { $min : "$values"}]}
        } 
        
      },
      { $unwind : "$values" 
      },
      { $project: 
         { name: 1,    
           count : 1,     
           min : 1,
           max : 1,
           type : 1,
//            norm: { $multiply: [ "$values", a] }, 
           norm: { $cond: [ "$normalized", "$values",{$divide: [ {$subtract: ["$values", "$min" ]}, "$range"] }] } }
      },
      { $group:
         {           
            _id: { name : "$name" , 
                   count: "$count",  
                   min: "$min",
                   max: "$max",
                   type: "$type"
                },        
            values: { $push: "$norm" }
         }
      },
      { $project: 
          { 
             _id : 0,            
             name : "$_id.name",
             count : "$_id.count",
             min : "$_id.min",
             max : "$_id.max",
             type : "$_id.type",  
             min_index: { $indexOfArray : [ "$values", 0.0 ] },
             max_index: { $indexOfArray : [ "$values", 1.0 ] },
             date : new Date(), 
             values: "$values"
          } 
      }
//       { $out: "normalized"  }      
    ]
) 
  
  get some statistics:
 db.raw.aggregate( [
   { $project: { 
       _id: "$name" , 
       count: { $size: "$values" }, 
       min: { $min : "$values"}, 
       max: { $max : "$values" }, 
       mean: { $avg : "$values" }, 
       stdev: {$stdDevPop : "$values"},
       range: { $subtract: [ { $max : "$values" }, { $min : "$values"}]},
       min_index: { $indexOfArray : [ "$values", { $min : "$values"} ] },
       max_index: { $indexOfArray : [ "$values", { $max : "$values"} ] }
       } 
    }
 ])

  
   */
  /**
   * TODO: replace min and max with metadata.min and metadata.max No need of
   * assigning them
   * @param to_collection
   * @return
   */
  static List<Bson> normalize(String to_collection) {
    return asList(
        project(fields(
            excludeId(),
            include("values", "metadata.name", "metadata.type",
                "metadata.norm", "metadata.norm_min", "metadata.norm_max",
                "metadata.min", "metadata.max", "metadata.values_id"),
            computed("count", new Document("$size", "$values")),
            computed("min", "$metadata.min"),
            computed("max", "$metadata.max"),
            computed("range", new Document("$subtract",
                asList("$metadata.max", "$metadata.min"))))),
        unwind("$values"),
        project(fields(
            excludeId(),
            include("values", "min", "max", "count", "metadata.name",
                "metadata.type", "metadata.norm", "metadata.norm_min",
                "metadata.norm_max", "metadata.values_id"),
            computed("normVals", new Document(
                "$cond", asList("$metadata.norm", "$values", new Document(
                    "$sum", asList(new Document(
                        "$divide", asList(new Document(
                            "$multiply", asList(new Document(
                                "$subtract", asList("$values", "$min")),
                                new Document("$subtract", asList(
                                    "$metadata.norm_max", "$metadata.norm_min"))
                            )
                        ), "$range")
                    ), "$metadata.norm_min")
                ))
            )
            ))),
        group(new Document("name", "$metadata.name")
            .append("type", "$metadata.type")
            .append("norm", "$metadata.norm")
            .append("norm_min", "$metadata.norm_min")
            .append("norm_max", "$metadata.norm_max")
            .append("values_id", "$metadata.values_id")
            .append("count", "$count")
            .append("min", "$min")
            .append("max", "$max"), push("values", "$normVals")),
        project(fields(
            excludeId(),
            computed("timestamp", new Date()),
            computed("metadata", new Document("name", "$_id.name")
                .append("type", "$_id.type")
                .append("norm", "$_id.norm")
                .append("norm_min", "$_id.norm_min")
                .append("norm_max", "$_id.norm_max")
                .append("values_id", "$_id.values_id")
                .append("count", "$_id.count")
                .append("min", "$_id.min")
                .append("max", "$_id.max")
                .append("min_index", new Document("$indexOfArray", asList("$values", "$_id.norm_min")))
                .append("max_index", new Document("$indexOfArray", asList("$values", "$_id.norm_max")))),
            computed("values", "$values"))
        ),
        out(to_collection)
    );
  }


  /*
  db.getCollection('raw').aggregate([
//  {   $match: { //match a specific dataset
//        _id : ObjectId("5e3104a73a4ae21d69da4420")
//      }
//  },
 { $project: {   // project only the values        
        "values" : 1,
        "_id" : 1,
        "name" : "$metadata.name"
      }
 },{ $unwind: '$values'  // unwind to be able to sort
 },{ $sort : {  // sort the values
       values : 1  
     } 
 },{ $group: { // recreate the array
       _id: "$_id",
       vals : { $push: "$values" },
       name : { $first: "$name" }
     }
  },{ $project: {   // find missing values (-9999) by filtering
        vals : 1,
        "name":1,
        v: {
            $filter: {
               input: "$vals",
               cond: { $eq: [ "$$this", -9999] }
            }
         }
      }
  },{ $project: {  // count the missing value, and all values          
        vals : 1,
      "name":1,
        count_missing: { $size: "$v" },
        count: { $size: "$vals" }
    }
  },{ $project: { // compute percentile indices
        count : 1,   
        vals : 1,
        count_missing : 1,
      "name":1,
        p25Index : { $toInt : { $divide: [ { $multiply: [ "$count", 25 ]}, 100] } },
        p50Index : { $toInt : { $divide: [ { $multiply: [ "$count", 50 ]}, 100] } },
        p75Index : { $toInt : { $divide: [ { $multiply: [ "$count", 75 ]}, 100] } }, 
     }
 },{ $set: {  // compute the percentiles
        min : { $min: "$vals" },
        p25 : { $arrayElemAt: [ "$vals", "$p25Index" ] },
        p50 : { $arrayElemAt: [ "$vals", "$p50Index" ] },
        p75 : { $arrayElemAt: [ "$vals", "$p75Index" ] },
        max : { $max: "$vals" }
     }
 },{ $set: {    // compute 
        irq: { $subtract: ["$p75", "$p25"] },
        mean: { $avg: "$vals" },
        stdDev: { $stdDevPop: "$vals" }
    }
  },{ $set: {
        of_min: { $subtract: [ "$p25" , { $multiply: [ "$irq", 3.0] }] },
        if_min: { $subtract: [ "$p25" , { $multiply: [ "$irq", 1.5] }] }, 
        if_max: { $add: [ "$p75" , { $multiply: [ "$irq", 1.5] }] }, 
        of_max: { $add: [ "$p75" , { $multiply: [ "$irq", 3.0] }] } 
    }
  },{ $project: {   // outliers by filtering
        name:1,
        count : 1,   
        vals : 1,
        if_min : 1,
        if_max : 1,
        of_min : 1,
        of_max : 1,
        min: 1,
        p25:1,
        p50:1,
        p75:1,
        max: 1,
        irq:1,
        mean:1,
        stdDev:1,
        count_missing:1,
        minor_outliers: {
            $filter: {
               input: "$vals",
               cond: { $or:  [
                       { $and: [ { $lt: [ "$$this", "$if_min"] }, { $gt: [ "$$this", "$of_min"] } ] } ,
                       { $and: [ { $lt: [ "$$this", "$of_max"] }, { $gt: [ "$$this", "$if_max"] } ] } 
                       ]}
            }
         },
         major_outliers: {
            $filter: {
               input: "$vals",
               cond: { $or: [ { $lt: [ "$$this", "$of_min"] }, { $gt: [ "$$this", "$of_max"] } ]  }
            }
         }
      }
  },{ 
    $project: {    // remove not needed fields
        p25Index : 0,
        p50Index : 0,
        p75Index : 0,
//         if_min : 0,
//         if_max : 0,
//         of_min : 0,
//         of_max : 0,
        vals: 0,
              
    }
}], { allowDiskUse: true } )
  
   */
  static List<Bson> raw_stats(String to_collection) {
    // TODO create Java Aggregate
    return null;
  }


  /*
  db.getCollection('raw').aggregate([
 {   $match : { //match a two datasets
        $or: [ {"metadata.name": "pd"}, {"metadata.name": "da"}  ] 
     }
 },
 { $project: {         
        values : 1,
        _id : 1,
        mean: { $avg: "$values"},
        sd: { $stdDevPop: "$values" },
        n: { $size: "$values"},
        name : "$metadata.name"
      }
  },{
     $group: {
        _id: 0,
        n: {$last: "$n"},
        mean_x: {$last: "$mean"},
        mean_y: {$first: "$mean"},
        sd_x: { $last: "$sd" },
        sd_y: { $first: "$sd" },
        values_x: {$last: "$values"},
        values_y: {$first: "$values"},
        name_x: {$last: "$name"},
        name_y: {$first: "$name"},
    }   
},{
    $project: {
        values_x:1,
        values_y:1,
        mean_x:1,
        mean_y:1,
        sd_x:1,
        sd_y:1,
        name_x:1,
        name_y:1,
        n:1,
        dp: {
            $reduce: {
              input: { $range: [ 0, "$n"] },
              initialValue: 0,
              in: { $add: [ "$$value", { $multiply: [ { $arrayElemAt: [ "$values_x", "$$this" ] }, { $arrayElemAt: [ "$values_y", "$$this" ] } ] } ] }
            }
         },   
//         x_min_mean: {
//             $map: { input: "$values_x", as: "x", in: { $subtract: [ "$$x", "$mean_x"] }} 
//         },
//         y_min_mean: {
//             $map: { input: "$values_y", as: "y", in: { $subtract: [ "$$y", "$mean_y"] }} 
//         }
// 
    }
},{ 
    $set: {
       na: { $concat: [ "$name_x", "-", "$name_y"] }, 
       r: { $divide: [
              { $subtract: [ "$dp", { $multiply: ["$n", "$mean_x", "$mean_y"] }]},
              { $multiply: [ "$sd_x", "$sd_y", { $subtract: [ "$n" , 1 ]}] }
            ]
        }
    }
},{
    $group: {
        _id:0,
        pearson_correlation: { $addToSet: { vars: "$na", r: "$r"} }
    }
 }], { allowDiskUse:  true })
  
   */
  static List<Bson> pearson(String to_collection, String var1, String var2) {
    // TODO create Java Aggregate
    return null;
  }

}