TextParser.java [src/csip/utils] Revision:   Date:
/*
 * $Id: 2.8+9 TextParser.java c152fb842f98 2023-11-28 od $
 *
 * This file is part of the Cloud Services Integration Platform (CSIP),
 * a Model-as-a-Service framework, API and application suite.
 *
 * 2012-2022, Olaf David and others, OMSLab, Colorado State University.
 *
 * OMSLab licenses this file to you under the MIT license.
 * See the LICENSE file in the project root for more information.
 */
package csip.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.DoubleStream;
import java.util.stream.IntStream;

/**
 * Extract tokens from an ASCII file (usually some kind of model output).
 *
 * <p>
 * The parser wraps a {@link BufferedReader} and keeps one <em>current
 * line</em>. Three groups of methods operate on it:</p>
 * <ul>
 * <li><b>navigation</b> ({@code nextLine...}, {@code toLine...},
 * {@code skipLines...}) reads further lines into the current line,</li>
 * <li><b>transformation</b> ({@code rightOf}, {@code leftOf}, {@code trim},
 * {@code replace...}, ...) rewrites the current line in place,</li>
 * <li><b>terminating operations</b> ({@code split}, {@code asString},
 * {@code asDouble}, ...) return a value and, unless
 * {@link #autoClose(boolean) autoClose(false)} was called, close the
 * reader.</li>
 * </ul>
 * <p>
 * When the end of the input is reached the current line becomes {@code null}
 * and the reader is closed. Transformations applied to a {@code null} line
 * fail with a {@link NullPointerException}.</p>
 *
 * @author od
 */
public class TextParser implements AutoCloseable {

  /** Regular expression for splitting on any run of white space. */
  public static final String WS_SEP = "\\s+";
  /** Regular expression for splitting on a comma and the white space around it. */
  public static final String COMMA_SEP = "\\s*,\\s*";

  /** Matches a line without any characters. */
  public static final Pattern EMPTY = Pattern.compile("^$");
  /** Matches a line starting with '#'. */
  public static final Pattern START_HASH = Pattern.compile("^#.*$");
  /** Matches a line starting with '//'. */
  public static final Pattern START_SLASH_SLASH = Pattern.compile("^//.*$");

  /** Read-ahead limit (in characters) handed to the reader by {@link #mark()}. */
  private static final int MARK_READ_AHEAD_LIMIT = 100000;

  /** The underlying reader. */
  BufferedReader r;
  /** The current line; null once the end of the input was reached. */
  String line = "";
  /** Name of the source, only used in messages. */
  String name;
  /** Number of read attempts so far (the attempt that hits the end counts too). */
  int lineno;
  /** If true, terminating operations close the reader. */
  boolean autoclose = true;
  /** True once the reader was closed. */
  boolean isClosed = false;
  /** Value of {@link #lineno} at the time of the last {@link #mark()}, -1 if never marked. */
  int markedLineNo = -1;

  /**
   * Create a parser reading from a file.
   *
   * @param file the file to read
   * @throws FileNotFoundException if the file cannot be opened for reading
   */
  public TextParser(File file) throws FileNotFoundException {
    r = new BufferedReader(new FileReader(file));
    name = file.toString();
  }

  /**
   * Create a parser reading from a path.
   *
   * @param path the path of the file to read
   * @throws FileNotFoundException if the file cannot be opened for reading
   */
  public TextParser(Path path) throws FileNotFoundException {
    r = new BufferedReader(new FileReader(path.toFile()));
    name = path.toString();
  }

  /**
   * Create a parser reading from an input stream.
   *
   * @param is the stream to read
   * @param name the name used for this source in messages
   * @throws IOException declared for API compatibility
   */
  public TextParser(InputStream is, String name) throws IOException {
    r = new BufferedReader(new InputStreamReader(is));
    this.name = name;
  }

  /**
   * Create a parser reading from a file using a custom buffer size.
   *
   * @param file the file to read
   * @param buffsz the buffer size passed to the {@link BufferedReader}
   * @throws FileNotFoundException if the file cannot be opened for reading
   */
  public TextParser(File file, int buffsz) throws FileNotFoundException {
    r = new BufferedReader(new FileReader(file), buffsz);
    name = file.toString();
  }

  /**
   * Create a parser reading from a string.
   *
   * @param content the text to parse
   * @param name the name used for this source in messages
   */
  public TextParser(String content, String name) {
    r = new BufferedReader(new StringReader(content));
    this.name = name;
  }

  /**
   * Read the next line and make it the current line.
   *
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLine() throws IOException {
    return nextLine(1);
  }

  /**
   * Read lines until one is found that contains more than white space, or the
   * end of the input is reached.
   *
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLineSkipEmpty() throws IOException {
    do {
      nextLine(1);
    } while (line != null && line.trim().isEmpty());
    return this;
  }

  /**
   * Read lines until one is found that fully matches none of the patterns, or
   * the end of the input is reached.
   *
   * @param pattern the patterns of lines to skip
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLineSkip(Pattern... pattern) throws IOException {
    boolean skip = false;
    do {
      nextLine(1);
      if (line != null) {
        skip = false;
        for (Pattern p : pattern) {
          if (p.matcher(line).matches()) {
            skip = true;
            break;
          }
        }
      }
    } while (line != null && skip);
    return this;
  }

  /**
   * Advance by a number of lines; the last line read becomes the current
   * line. Reading stops early at the end of the input.
   *
   * @param lines the number of lines to read, must be at least 1
   * @return this instance
   * @throws IllegalArgumentException if lines is smaller than 1
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLine(int lines) throws IOException {
    if (lines < 1)
      throw new IllegalArgumentException("error: lines < 1");

    String err = "Cannot skip " + lines + " in " + name;
    for (int i = 0; i < lines; i++) {
      line = readLine(err);
      if (line == null)
        break;
    }
    return this;
  }

  /**
   * Read lines until one contains the text, or the end of the input is
   * reached (the current line is null then).
   *
   * @param text the text to look for
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineContaining(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && !line.contains(text));
    return this;
  }

  /**
   * Read lines until one contains every given text, or the end of the input
   * is reached (the current line is null then).
   *
   * @param text the texts that all have to be contained in the line
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineContainingAllOf(String... text) throws IOException {
    String err = "Not found in " + name + ": " + Arrays.toString(text);
    boolean all_match = true;
    do {
      line = readLine(err);
      if (line != null) {
        all_match = true;
        for (String t : text) {
          if (!line.contains(t)) {
            all_match = false;
            break;
          }
        }
      }
    } while (line != null && !all_match);
    return this;
  }

  /**
   * Read lines until one starts with the text, or the end of the input is
   * reached (the current line is null then).
   *
   * @param text the prefix to look for
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineStartingWith(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && !line.startsWith(text));
    return this;
  }

  /**
   * Read lines until one ends with the text, or the end of the input is
   * reached (the current line is null then).
   *
   * @param text the suffix to look for
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineEndingWith(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && !line.endsWith(text));
    return this;
  }

  /**
   * Read lines until one fully matches the regular expression, or the end of
   * the input is reached (the current line is null then).
   *
   * @param regex the regular expression the whole line has to match
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineMatching(String regex) throws IOException {
    String err = "No match found in " + name + ": " + regex;
    Pattern p = Pattern.compile(regex);
    do {
      line = readLine(err);
    } while (line != null && !p.matcher(line).matches());
    return this;
  }

  /**
   * Read lines as long as they contain the text. The first line read that
   * does not contain it becomes the current line.
   *
   * @param text the text marking lines to skip
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser skipLinesContaining(String text) throws IOException {
    String err = "Skipping lines for " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && line.contains(text));
    return this;
  }

  /**
   * Read lines as long as they start with the text. The first line read that
   * does not start with it becomes the current line.
   *
   * @param text the prefix marking lines to skip
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser skipLinesStartingWith(String text) throws IOException {
    String err = "Skipping lines starting for " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && line.startsWith(text));
    return this;
  }

  /**
   * Set this to false if terminators should not close the stream
   *
   * @param autoclose if the terminating operation closes the stream
   * @return this instance
   */
  public TextParser autoClose(boolean autoclose) {
    this.autoclose = autoclose;
    return this;
  }

  /**
   * Reduce the current line to the part right of the first occurrence of the
   * text argument.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not part of the line
   * @deprecated use {@link #rightOf(String)}
   */
  @Deprecated
  public TextParser rightOfFirst(String text) {
    return rightOf(text);
  }

  /**
   * Reduce the current line to the part right of the first occurrence of the
   * text argument.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not part of the line
   */
  public TextParser rightOf(String text) {
    int st = line.indexOf(text);
    if (st == -1)
      throw new IllegalArgumentException("text not found in " + line + ": " + text);
    line = line.substring(st + text.length());
    return this;
  }

  /**
   * Reduce the current line to the part right of the last occurrence of the
   * text argument.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not part of the line
   */
  public TextParser rightOfLast(String text) {
    int st = line.lastIndexOf(text);
    if (st == -1)
      throw new IllegalArgumentException("text not found in " + line + ": " + text);
    line = line.substring(st + text.length());
    return this;
  }

  /**
   * Reduce the current line to the part left of the first occurrence of the
   * text argument.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not part of the line
   * @deprecated use {@link #leftOf(String)}
   */
  @Deprecated
  public TextParser leftOfFirst(String text) {
    return leftOf(text);
  }

  /**
   * Reduce the current line to the part left of the first occurrence of the
   * text argument.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not part of the line
   */
  public TextParser leftOf(String text) {
    int st = line.indexOf(text);
    if (st == -1)
      throw new IllegalArgumentException("text not found in " + line + ": " + text);
    line = line.substring(0, st);
    return this;
  }

  /**
   * Reduce the current line to the part left of the last occurrence of the
   * text argument.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not part of the line
   */
  public TextParser leftOfLast(String text) {
    int st = line.lastIndexOf(text);
    if (st == -1)
      throw new IllegalArgumentException("text not found in " + line + ": " + text);
    line = line.substring(0, st);
    return this;
  }

  /**
   * Apply {@link String#replaceAll(String, String)} to the current line.
   *
   * @param regex the regular expression to match
   * @param text the replacement
   * @return this instance
   */
  public TextParser replaceAll(String regex, String text) {
    line = line.replaceAll(regex, text);
    return this;
  }

  /**
   * Apply {@link String#replace(CharSequence, CharSequence)} to the current
   * line.
   *
   * @param target the literal text to replace
   * @param text the replacement
   * @return this instance
   */
  public TextParser replace(String target, String text) {
    line = line.replace(target, text);
    return this;
  }

  /**
   * Apply {@link String#replaceFirst(String, String)} to the current line.
   *
   * @param regex the regular expression to match
   * @param text the replacement
   * @return this instance
   */
  public TextParser replaceFirst(String regex, String text) {
    line = line.replaceFirst(regex, text);
    return this;
  }

  /**
   * Trim the current line.
   *
   * @return this instance
   */
  public TextParser trim() {
    line = line.trim();
    return this;
  }

  /**
   * Convert the current line to upper case.
   *
   * @return this instance
   */
  public TextParser toUpperCase() {
    line = line.toUpperCase();
    return this;
  }

  /**
   * Convert the current line to lower case.
   *
   * @return this instance
   */
  public TextParser toLowerCase() {
    line = line.toLowerCase();
    return this;
  }

  /**
   * Mark the current reader position so that {@link #linesFromMark()} can
   * return the lines read after this call. A failure to mark is logged and
   * leaves any previous mark bookkeeping untouched.
   *
   * @return this instance
   */
  public TextParser mark() {
    try {
      r.mark(MARK_READ_AHEAD_LIMIT);
      markedLineNo = lineno;
    } catch (IOException ex) {
      Logger.getLogger(TextParser.class.getName()).log(Level.SEVERE, null, ex);
    }
    return this;
  }

  /**
   * Reset the reader to the last mark and read again all lines that were
   * consumed since then. This is a terminating operation.
   *
   * @return the lines read between the mark and the current position
   * @throws IOException if no mark was set, or resetting/reading fails
   * @throws IllegalArgumentException if no line was read since the mark
   */
  public List<String> linesFromMark() throws IOException {
    if (markedLineNo == -1)
      throw new IOException("Not previously marked.");

    List<String> l = new ArrayList<>();
    int lines = lineno - markedLineNo;
    if (lines < 1)
      throw new IllegalArgumentException("error: lines < 1");

    r.reset();
    lineno = markedLineNo;

    String err = "Cannot skip " + lines + " in " + name;
    for (int i = 0; i < lines; i++) {
      l.add(line = readLine(err));
    }
    ac();
    return l;
  }

  /**
   * Create a new (!) TextParser with the all content from the current position
   * until the text. 'text' is included. The content starts with the current
   * line, followed by every line read until (and including) the first one
   * containing the text.
   *
   * @param text the text to look for
   * @return a new instance, or null if the end of the input was reached
   * without finding the text.
   * @throws IOException if reading fails
   */
  public TextParser allUntil(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    // a builder avoids re-copying the collected content for every line
    StringBuilder subContent = new StringBuilder().append(line).append('\n');
    do {
      line = readLine(err);
      if (line == null)
        return null;
      subContent.append(line).append('\n');
    } while (!line.contains(text));
    return new TextParser(subContent.toString(), "until");
  }

  // Terminating operations. they close the stream.
  //
  /**
   * Split the current line at white space.
   *
   * @return the tokens of the current line
   * @deprecated use {@link #split()}
   */
  @Deprecated
  public Tokens tokens() {
    return tokens(WS_SEP);
  }

  /**
   * Split the current line using a regular expression.
   *
   * @param regex the separator expression
   * @return the tokens of the current line
   * @deprecated use {@link #split(String)}
   */
  @Deprecated
  public Tokens tokens(String regex) {
    return split(regex);
  }

  /**
   * Split the current line at white space.
   *
   * @return the tokens of the current line
   */
  public Tokens split() {
    return split(WS_SEP);
  }

  /**
   * Split the trimmed current line using a regular expression. A null line
   * results in no tokens.
   *
   * @param regex the separator expression
   * @return the tokens of the current line
   */
  public Tokens split(String regex) {
    ac();
    return new Tokens(line, regex);
  }

  /**
   * Get the current line.
   *
   * @return the current line, null if the end of the input was reached.
   */
  public String asString() {
    ac();
    return line;
  }

  /**
   * Parse the trimmed current line as double.
   *
   * @return the value of the current line
   * @throws NumberFormatException if the line is not a valid double
   */
  public double asDouble() {
    ac();
    return Double.parseDouble(line.trim());
  }

  /**
   * Parse the trimmed current line as integer.
   *
   * @return the value of the current line
   * @throws NumberFormatException if the line is not a valid integer
   */
  public int asInteger() {
    ac();
    return Integer.parseInt(line.trim());
  }

  /**
   * Get the line counter, which is the number of read attempts made so far.
   *
   * @return the line counter
   */
  public int getLineNo() {
    ac();
    return lineno;
  }

  /**
   * Get a white space separated token of the current line.
   *
   * @param col the column, starts with 0.
   * @return the token at column 'col', or null if there is no such column
   */
  public String getWsTokenAt(int col) {
    ac();
    return extractWSToken(line, col);
  }

  /**
   * Get a comma separated token of the current line.
   *
   * @param col the column, starts with 0.
   * @return the token at column 'col', or null if there is no such column
   */
  public String getCSVTokenAt(int col) {
    ac();
    return extractCSVToken(line, col);
  }

  /** Close the reader if terminating operations are supposed to do so. */
  private void ac() {
    if (autoclose)
      close();
  }

  /**
   * Close the underlying reader and flag this parser as closed. A failure
   * while closing is not reported.
   */
  @Override
  public synchronized void close() {
    if (r != null) {
      try {
        r.close();
      } catch (IOException ignored) {
        // best effort: there is nothing left to do if closing fails
      } finally {
        isClosed = true;
      }
    }
  }

  /**
   * Check if the parser is still open.
   *
   * @return true if the reader was not closed yet (neither by reaching the
   * end of the input, a terminating operation, nor {@link #close()})
   */
  public boolean notEOF() {
    return !isClosed;
  }

  /**
   * Check if the parser is closed.
   *
   * @return true if the reader was closed
   */
  public boolean isEOF() {
    return isClosed;
  }

  /**
   * Check if the parser is open and the current line has content.
   *
   * @return true if not closed and the current line contains more than white
   * space
   */
  public boolean notEmpty() {
    return !isClosed && !line.trim().isEmpty();
  }

  /**
   * Check if the parser is open and the current line has no content.
   *
   * @return true if not closed and the current line contains nothing but
   * white space
   */
  public boolean isEmpty() {
    return !isClosed && line.trim().isEmpty();
  }

  /**
   * Read one line from the reader. Reaching the end of the input closes the
   * parser. The line counter is incremented for every attempt.
   *
   * @param err context appended to the message if the parser is closed
   * @return the line read, or null at the end of the input
   * @throws IOException if the parser is already closed or reading fails
   */
  private String readLine(String err) throws IOException {
    if (isClosed)
      throw new IOException(name + ": already closed. Use autoClose(false) to allow for successive reads. Do "
          + "not forget to close the stream at the end.\n" + err);
    String l = r.readLine();

    if (l == null)
      close();

    lineno++;
    return l;
  }

  @Override
  public String toString() {
    return lineno + ":  '" + line + "'";
  }

  /**
   * Fast token extract. tokens are separated by any number of white spaces.
   * Tokens cannot contain any white spaces, as in csv. Leading and trailing
   * white space is ignored.
   *
   * @param line the line to get the token from
   * @param col the column, starts with 0.
   * @return the token at column 'col', or null if the line has fewer columns
   * @throws IllegalArgumentException if col is negative
   */
  static String extractWSToken(String line, int col) {
    return extractToken(line, col, false);
  }

  /**
   * Fast token extract. tokens are separated by any number of commas, hence
   * there are no empty tokens. White space is part of a token.
   *
   * @param line the line to get the token from
   * @param col the column, starts with 0.
   * @return the token at column 'col', or null if the line has fewer columns
   * @throws IllegalArgumentException if col is negative
   */
  static String extractCSVToken(String line, int col) {
    return extractToken(line, col, true);
  }

  /**
   * Scan the line once and return the token at the given column. Every index
   * is checked against the line length before a character is accessed, hence
   * single character tokens and trailing separators are handled as well.
   *
   * @param line the line to scan
   * @param col the column, starts with 0.
   * @param csv true if a comma separates tokens, false for white space
   * @return the token, or null if the line has fewer columns
   */
  private static String extractToken(String line, int col, boolean csv) {
    if (col < 0)
      throw new IllegalArgumentException("col argument < 0.");

    final int len = line.length();
    int idx = 0;
    int tokens = 0;
    while (idx < len) {
      // skip the separator run in front of the next token
      while (idx < len && isSeparator(line.charAt(idx), csv))
        idx++;
      if (idx == len)
        break;            // nothing but separators left
      int start = idx;
      while (idx < len && !isSeparator(line.charAt(idx), csv))
        idx++;
      if (tokens++ == col)
        return line.substring(start, idx);
    }
    return null;
  }

  /** Tell if a character separates tokens in the selected mode. */
  private static boolean isSeparator(char c, boolean csv) {
    return csv ? c == ',' : Character.isWhitespace(c);
  }

//////////////
  /**
   * The tokens of a line. Range operations and {@link #reverse()} modify this
   * instance and return it to allow chaining.
   */
  public static class Tokens {

    String[] tok;
    static final String[] EMPTY = new String[]{};

    /**
     * Split the trimmed line; a null line results in no tokens.
     *
     * @param line the line to split, may be null
     * @param regex the separator expression
     */
    private Tokens(String line, String regex) {
      tok = (line == null) ? EMPTY : line.trim().split(regex);
    }

    /**
     * Get the number of tokens.
     *
     * @return the number of tokens
     */
    public int count() {
      return tok.length;
    }

    /**
     * Find the first index of val. in the token array.
     *
     * @param val the string to match (equals)
     * @return the index of val, or -1 if not existent.
     */
    public int indexOf(String val) {
      for (int i = 0; i < tok.length; i++) {
        if (tok[i] == null && val == null)
          return i;
        if (tok[i] != null && tok[i].equals(val))
          return i;
      }
      return -1;
    }

    /**
     * Keep the tokens starting at an index.
     *
     * @param i the index of the first token to keep
     * @return this instance
     * @deprecated replace with rangeFrom()
     */
    @Deprecated
    public Tokens fromIndex(int i) {
      return rangeFrom(i);
    }

    /**
     * Keep the tokens starting at an index.
     *
     * @param from the index of the first token to keep
     * @return this instance
     * @throws IllegalArgumentException if from is not a valid token index
     */
    public Tokens rangeFrom(int from) {
      if (from < 0 || from > tok.length - 1)
        throw new IllegalArgumentException("invalid begin index:" + from);
      if (from == 0)
        return this;
      tok = Arrays.copyOfRange(tok, from, tok.length);
      return this;
    }

    /**
     * Drop tokens at the end.
     *
     * @param i must be negative or 0, the (negated) number of tokens to drop
     * @return this instance
     * @deprecated replace with rangeTo()
     */
    @Deprecated
    public Tokens toIndex(int i) {
      return rangeTo(i);
    }

    /**
     * Drop tokens at the end, at least one token is always kept.
     *
     * @param to must be negative or 0 (starts from the end of the string, a la
     * python); -1 drops the last token, 0 keeps all tokens
     * @return this instance
     * @throws IllegalArgumentException if to is positive or would not leave
     * any token
     */
    public Tokens rangeTo(int to) {
      if (to > 0 || -to > tok.length - 1)
        throw new IllegalArgumentException("invalid end index, (must be negative): " + to);
      if (to == 0)
        return this;
      tok = Arrays.copyOfRange(tok, 0, tok.length + to);
      return this;
    }

    /**
     * Gets all tokens in the range 'from -> to'. Both indexes may be equal,
     * which results in a single token.
     *
     * @param from index (inclusive)
     * @param to index (inclusive)
     * @return this instance
     * @throws IllegalArgumentException if an index is not a valid token index
     * or 'to' is smaller than 'from'
     */
    public Tokens range(int from, int to) {
      if (from < 0 || from > tok.length - 1)
        throw new IllegalArgumentException("invalid 'from' index: " + from);
      if (to < 0 || to > tok.length - 1)
        throw new IllegalArgumentException("invalid 'to' index: " + to);
      if (to < from)
        throw new IllegalArgumentException("invalid 'from/to' index :" + from + " " + to);

      if (from == 0 && to == tok.length - 1)
        return this;

      tok = Arrays.copyOfRange(tok, from, to + 1);
      return this;
    }

    /**
     * Reverse the order of the tokens.
     *
     * @return this instance
     */
    public Tokens reverse() {
      int len = tok.length;
      for (int i = 0; i < len / 2; i++) {
        String tmp = tok[i];
        tok[i] = tok[len - i - 1];
        tok[len - i - 1] = tmp;
      }
      return this;
    }

    /** All tokens parsed as double. */
    private DoubleStream dstream() {
      return Arrays.stream(tok).mapToDouble(Double::parseDouble);
    }

    /** All tokens parsed as int. */
    private IntStream istream() {
      return Arrays.stream(tok).mapToInt(Integer::parseInt);
    }

    /** Fixed-size list view of the token array. */
    private List<String> tokenlist() {
      return Arrays.asList(tok);
    }

    /**
     * Get a token as int.
     *
     * @param i the index of the token
     * @return the token parsed as int
     */
    public int intAt(int i) {
      return Integer.parseInt(tok[i]);
    }

    /**
     * Get a token as double.
     *
     * @param i the index of the token
     * @return the token parsed as double
     */
    public double doubleAt(int i) {
      return Double.parseDouble(tok[i]);
    }

    /**
     * Sum up all tokens.
     *
     * @return the sum of all tokens parsed as double
     */
    public double sum() {
      return dstream().sum();
    }

    /**
     * Average all tokens.
     *
     * @return the average of all tokens parsed as double
     * @throws java.util.NoSuchElementException if there are no tokens
     */
    public double average() {
      return dstream().average().getAsDouble();
    }

    /**
     * Find the smallest token.
     *
     * @return the minimum of all tokens parsed as double
     * @throws java.util.NoSuchElementException if there are no tokens
     */
    public double min() {
      return dstream().min().getAsDouble();
    }

    /**
     * Find the largest token.
     *
     * @return the maximum of all tokens parsed as double
     * @throws java.util.NoSuchElementException if there are no tokens
     */
    public double max() {
      return dstream().max().getAsDouble();
    }

    /**
     * Get all tokens as doubles.
     *
     * @return the tokens parsed as double
     * @throws NumberFormatException if a token is not a valid double
     */
    public double[] asDoubleArray() {
      return dstream().toArray();
    }

    /**
     * This method can handle null and empty tokens
     *
     * @return the tokens as Double, null for each null or empty token
     */
    public List<Double> asDoubleList() {
      return tokenlist().stream()
          .map(v -> (v == null || v.isEmpty()) ? null : Double.valueOf(v))
          .collect(Collectors.toList());
    }

    /**
     * Get all tokens as ints.
     *
     * @return the tokens parsed as int
     * @throws NumberFormatException if a token is not a valid int
     */
    public int[] asIntArray() {
      return istream().toArray();
    }

    /**
     * This method can handle null and empty tokens
     *
     * @return the tokens as Integer, null for each null or empty token
     */
    public List<Integer> asIntList() {
      return tokenlist().stream()
          .map(v -> (v == null || v.isEmpty()) ? null : Integer.valueOf(v))
          .collect(Collectors.toList());
    }

    /**
     * Get the tokens as array.
     *
     * @return the token array of this instance (not a copy)
     */
    public String[] asStringArray() {
      return tok;
    }

    /**
     * Get the tokens as list.
     *
     * @return a fixed-size list backed by the token array of this instance
     */
    public List<String> asStringList() {
      return tokenlist();
    }
  }
}