// TextParser.java [src/csip/utils] Revision: Date:
/*
* $Id: 2.8+9 TextParser.java c152fb842f98 2023-11-28 od $
*
* This file is part of the Cloud Services Integration Platform (CSIP),
* a Model-as-a-Service framework, API and application suite.
*
* 2012-2022, Olaf David and others, OMSLab, Colorado State University.
*
* OMSLab licenses this file to you under the MIT license.
* See the LICENSE file in the project root for more information.
*/
package csip.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.DoubleStream;
import java.util.stream.IntStream;
/**
* Extract tokens from an ASCII file (usually some kind of model output)
*
* @author od
*/
/**
 * Line oriented parser for extracting tokens from a text source (usually some
 * kind of model output).
 * <p>
 * A parser is used as a fluent chain of three kinds of calls:
 * <ol>
 * <li><b>navigation</b> ({@code nextLine}, {@code toLine...},
 * {@code skipLines...}) reads lines from the source and makes one of them the
 * <i>current line</i>,</li>
 * <li><b>line editing</b> ({@code rightOf}, {@code leftOf}, {@code replace},
 * {@code trim}, ...) modifies the current line in place,</li>
 * <li><b>terminators</b> ({@code split}, {@code asString}, {@code asDouble},
 * ...) return a value derived from the current line and, unless
 * {@link #autoClose(boolean) autoClose(false)} was called, close the
 * source.</li>
 * </ol>
 * Once the end of the source is reached the current line is {@code null} and
 * the parser closes itself; any further navigation call then fails with an
 * {@link IOException}.
 *
 * @author od
 */
public class TextParser implements AutoCloseable {

  /** Regex separating tokens by any run of white space. */
  public static final String WS_SEP = "\\s+";
  /** Regex separating tokens by a comma with optional surrounding white space. */
  public static final String COMMA_SEP = "\\s*,\\s*";

  /** Matches an empty line. */
  public static final Pattern EMPTY = Pattern.compile("^$");
  /** Matches a line starting with '#'. */
  public static final Pattern START_HASH = Pattern.compile("^#.*$");
  /** Matches a line starting with '//'. */
  public static final Pattern START_SLASH_SLASH = Pattern.compile("^//.*$");

  /**
   * Read-ahead limit (in characters) passed to the reader by {@link #mark()}.
   */
  private static final int MARK_READ_AHEAD = 100000;

  BufferedReader r;
  /** The current line; {@code null} after the end of the source was hit. */
  String line = "";
  /** Name of the source, only used in messages. */
  String name;
  /** Number of read attempts so far (the failing read at EOF counts too). */
  int lineno;
  boolean autoclose = true;
  boolean isClosed = false;
  /** Value of {@link #lineno} at the last {@link #mark()}, -1 if never marked. */
  int markedLineNo = -1;

  /**
   * Parse a file using the platform default charset.
   *
   * @param file the file to read
   * @throws FileNotFoundException if the file cannot be opened
   */
  public TextParser(File file) throws FileNotFoundException {
    r = new BufferedReader(new FileReader(file));
    name = file.toString();
  }

  /**
   * Parse a file using the platform default charset.
   *
   * @param path the path of the file to read
   * @throws FileNotFoundException if the file cannot be opened
   */
  public TextParser(Path path) throws FileNotFoundException {
    r = new BufferedReader(new FileReader(path.toFile()));
    name = path.toString();
  }

  /**
   * Parse an input stream using the platform default charset.
   *
   * @param is the stream to read
   * @param name the name used for this source in messages
   * @throws IOException declared for API compatibility
   */
  public TextParser(InputStream is, String name) throws IOException {
    r = new BufferedReader(new InputStreamReader(is));
    this.name = name;
  }

  /**
   * Parse a file using a custom reader buffer size.
   *
   * @param file the file to read
   * @param buffsz the buffer size handed to the {@link BufferedReader}
   * @throws FileNotFoundException if the file cannot be opened
   */
  public TextParser(File file, int buffsz) throws FileNotFoundException {
    r = new BufferedReader(new FileReader(file), buffsz);
    name = file.toString();
  }

  /**
   * Parse in-memory content.
   *
   * @param content the text to parse
   * @param name the name used for this source in messages
   */
  public TextParser(String content, String name) {
    r = new BufferedReader(new StringReader(content));
    this.name = name;
  }

  // Navigation. These read from the source and set the current line.
  //
  /**
   * Advance to the next line.
   *
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLine() throws IOException {
    return nextLine(1);
  }

  /**
   * Advance to the next line that contains more than white space.
   *
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLineSkipEmpty() throws IOException {
    do {
      nextLine(1);
    } while (line != null && line.trim().isEmpty());
    return this;
  }

  /**
   * Advance to the next line that fully matches none of the given patterns.
   *
   * @param pattern the patterns of lines to skip
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLineSkip(Pattern... pattern) throws IOException {
    boolean skip = false;
    do {
      nextLine(1);
      if (line != null) {
        skip = false;
        for (Pattern p : pattern) {
          if (p.matcher(line).matches()) {
            skip = true;
            break;
          }
        }
      }
    } while (line != null && skip);
    return this;
  }

  /**
   * Advance by a number of lines. Stops early if the end of the source is
   * reached.
   *
   * @param lines the number of lines to advance, must be &gt;= 1
   * @return this instance
   * @throws IllegalArgumentException if {@code lines < 1}
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser nextLine(int lines) throws IOException {
    if (lines < 1) {
      throw new IllegalArgumentException("error: lines < 1");
    }
    String err = "Cannot skip " + lines + " in " + name;
    for (int i = 0; i < lines; i++) {
      line = readLine(err);
      if (line == null) {
        break;
      }
    }
    return this;
  }

  /**
   * Advance to the next line containing the text. If there is none, the
   * current line becomes {@code null} and the parser is closed.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineContaining(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && !line.contains(text));
    return this;
  }

  /**
   * Advance to the next line containing every one of the given texts.
   *
   * @param text the texts that all have to be present in the line
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineContainingAllOf(String... text) throws IOException {
    String err = "Not found in " + name + ": " + Arrays.toString(text);
    boolean allMatch = true;
    do {
      line = readLine(err);
      if (line != null) {
        allMatch = true;
        for (String t : text) {
          if (!line.contains(t)) {
            allMatch = false;
            break;
          }
        }
      }
    } while (line != null && !allMatch);
    return this;
  }

  /**
   * Advance to the next line starting with the text.
   *
   * @param text the prefix to look for
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineStartingWith(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && !line.startsWith(text));
    return this;
  }

  /**
   * Advance to the next line ending with the text.
   *
   * @param text the suffix to look for
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineEndingWith(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && !line.endsWith(text));
    return this;
  }

  /**
   * Advance to the next line that fully matches the regular expression.
   *
   * @param regex the regular expression the whole line has to match
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser toLineMatching(String regex) throws IOException {
    String err = "No match found in " + name + ": " + regex;
    Pattern p = Pattern.compile(regex);
    do {
      line = readLine(err);
    } while (line != null && !p.matcher(line).matches());
    return this;
  }

  /**
   * Advance to the next line that does not contain the text.
   *
   * @param text the text identifying lines to skip
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser skipLinesContaining(String text) throws IOException {
    String err = "Skipping lines for " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && line.contains(text));
    return this;
  }

  /**
   * Advance to the next line that does not start with the text.
   *
   * @param text the prefix identifying lines to skip
   * @return this instance
   * @throws IOException if reading fails or the parser is already closed
   */
  public TextParser skipLinesStartingWith(String text) throws IOException {
    String err = "Skipping lines starting for " + name + ": " + text;
    do {
      line = readLine(err);
    } while (line != null && line.startsWith(text));
    return this;
  }

  /**
   * Set this to false if terminators should not close the stream
   *
   * @param autoclose if the terminating operation closes the stream
   * @return this instance
   */
  public TextParser autoClose(boolean autoclose) {
    this.autoclose = autoclose;
    return this;
  }

  // Line editing. These modify the current line in place.
  //
  /**
   * Keep the part of the current line right of the first occurrence of the
   * text.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not in the current line
   * @deprecated use {@link #rightOf(String)}
   */
  @Deprecated
  public TextParser rightOfFirst(String text) {
    return rightOf(text);
  }

  /**
   * Keep the part of the current line right of the first occurrence of the
   * text.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not in the current line
   */
  public TextParser rightOf(String text) {
    int st = requireFound(line.indexOf(text), text);
    line = line.substring(st + text.length());
    return this;
  }

  /**
   * Keep the part of the current line right of the last occurrence of the
   * text.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not in the current line
   */
  public TextParser rightOfLast(String text) {
    int st = requireFound(line.lastIndexOf(text), text);
    line = line.substring(st + text.length());
    return this;
  }

  /**
   * Keep the part of the current line left of the first occurrence of the
   * text.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not in the current line
   * @deprecated use {@link #leftOf(String)}
   */
  @Deprecated
  public TextParser leftOfFirst(String text) {
    return leftOf(text);
  }

  /**
   * Keep the part of the current line left of the first occurrence of the
   * text.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not in the current line
   */
  public TextParser leftOf(String text) {
    int st = requireFound(line.indexOf(text), text);
    line = line.substring(0, st);
    return this;
  }

  /**
   * Keep the part of the current line left of the last occurrence of the
   * text.
   *
   * @param text the text to look for
   * @return this instance
   * @throws IllegalArgumentException if the text is not in the current line
   */
  public TextParser leftOfLast(String text) {
    int st = requireFound(line.lastIndexOf(text), text);
    line = line.substring(0, st);
    return this;
  }

  /** Fails if a search index signals 'not found', passes it through otherwise. */
  private int requireFound(int index, String text) {
    if (index == -1) {
      throw new IllegalArgumentException("text not found in " + line + ": " + text);
    }
    return index;
  }

  /**
   * Apply {@link String#replaceAll(String, String)} to the current line.
   *
   * @param regex the regular expression to match
   * @param text the replacement
   * @return this instance
   */
  public TextParser replaceAll(String regex, String text) {
    line = line.replaceAll(regex, text);
    return this;
  }

  /**
   * Apply {@link String#replace(CharSequence, CharSequence)} to the current
   * line.
   *
   * @param target the literal text to replace
   * @param text the replacement
   * @return this instance
   */
  public TextParser replace(String target, String text) {
    line = line.replace(target, text);
    return this;
  }

  /**
   * Apply {@link String#replaceFirst(String, String)} to the current line.
   *
   * @param regex the regular expression to match
   * @param text the replacement
   * @return this instance
   */
  public TextParser replaceFirst(String regex, String text) {
    line = line.replaceFirst(regex, text);
    return this;
  }

  /**
   * Trim the current line.
   *
   * @return this instance
   */
  public TextParser trim() {
    line = line.trim();
    return this;
  }

  /**
   * Convert the current line to upper case.
   *
   * @return this instance
   */
  public TextParser toUpperCase() {
    line = line.toUpperCase();
    return this;
  }

  /**
   * Convert the current line to lower case.
   *
   * @return this instance
   */
  public TextParser toLowerCase() {
    line = line.toLowerCase();
    return this;
  }

  /**
   * Remember the current read position so that {@link #linesFromMark()} can
   * later return the lines read since. A failure to mark is logged, not thrown.
   *
   * @return this instance
   */
  public TextParser mark() {
    try {
      r.mark(MARK_READ_AHEAD);
      markedLineNo = lineno;
    } catch (IOException ex) {
      Logger.getLogger(TextParser.class.getName()).log(Level.SEVERE, null, ex);
    }
    return this;
  }

  /**
   * Get all lines that were read since {@link #mark()} was called. The reader
   * is reset to the mark and the lines are read again, so the current line
   * afterwards is the same as before the call. This is a terminating
   * operation.
   *
   * @return the lines read between the mark and the current position
   * @throws IOException if there is no mark, the reader cannot be reset, or
   * reading fails
   * @throws IllegalArgumentException if no line was read since the mark
   */
  public List<String> linesFromMark() throws IOException {
    if (markedLineNo == -1) {
      throw new IOException("Not previously marked.");
    }
    List<String> l = new ArrayList<>();
    int lines = lineno - markedLineNo;
    if (lines < 1) {
      throw new IllegalArgumentException("error: lines < 1");
    }
    r.reset();
    lineno = markedLineNo;
    String err = "Cannot skip " + lines + " in " + name;
    for (int i = 0; i < lines; i++) {
      l.add(line = readLine(err));
    }
    ac();
    return l;
  }

  /**
   * Create a new (!) TextParser with the all content from the current position
   * until the text. 'text' is included.
   * <p>
   * The content starts with the current line and ends with the first
   * subsequently read line that contains the text.
   *
   * @param text the text to look for
   * @return a new instance, or {@code null} if the text was not found before
   * the end of the source.
   * @throws IOException if reading fails
   */
  public TextParser allUntil(String text) throws IOException {
    String err = "Not found in " + name + ": " + text;
    StringBuilder subContent = new StringBuilder().append(line).append('\n');
    do {
      line = readLine(err);
      if (line == null) {
        return null;
      }
      subContent.append(line).append('\n');
    } while (!line.contains(text));
    return new TextParser(subContent.toString(), "until");
  }

  // Terminating operations. they close the stream.
  //
  /**
   * @return the white space separated tokens of the current line
   * @deprecated use {@link #split()}
   */
  @Deprecated
  public Tokens tokens() {
    return split(WS_SEP);
  }

  /**
   * @param regex the separator expression
   * @return the tokens of the current line
   * @deprecated use {@link #split(String)}
   */
  @Deprecated
  public Tokens tokens(String regex) {
    return split(regex);
  }

  /**
   * Split the trimmed current line at white space.
   *
   * @return the tokens of the current line
   */
  public Tokens split() {
    return split(WS_SEP);
  }

  /**
   * Split the trimmed current line.
   *
   * @param regex the separator expression, e.g. {@link #COMMA_SEP}
   * @return the tokens of the current line, no tokens if the current line is
   * {@code null}
   */
  public Tokens split(String regex) {
    ac();
    return new Tokens(line, regex);
  }

  /**
   * @return the current line, {@code null} if the end of the source was hit
   */
  public String asString() {
    ac();
    return line;
  }

  /**
   * @return the trimmed current line parsed as double
   */
  public double asDouble() {
    ac();
    return Double.parseDouble(line.trim());
  }

  /**
   * @return the trimmed current line parsed as int
   */
  public int asInteger() {
    ac();
    return Integer.parseInt(line.trim());
  }

  /**
   * Get the line counter. Note that this is a terminating operation too and
   * that the unsuccessful read at the end of the source is counted as well.
   *
   * @return the number of lines read so far
   */
  public int getLineNo() {
    ac();
    return lineno;
  }

  /**
   * @param col the token column, starts with 0
   * @return the white space separated token at 'col' of the current line, or
   * {@code null} if there are fewer tokens
   */
  public String getWsTokenAt(int col) {
    ac();
    return extractWSToken(line, col);
  }

  /**
   * @param col the token column, starts with 0
   * @return the comma separated token at 'col' of the current line, or
   * {@code null} if there are fewer tokens
   */
  public String getCSVTokenAt(int col) {
    ac();
    return extractCSVToken(line, col);
  }

  /** Closes the source after a terminating operation, if enabled. */
  private void ac() {
    if (autoclose) {
      close();
    }
  }

  /**
   * Close the underlying reader. A failure to close is ignored; the parser is
   * flagged as closed in any case.
   */
  @Override
  public synchronized void close() {
    if (r != null) {
      try {
        r.close();
      } catch (IOException ignored) {
        // best effort: nothing useful can be done if closing a reader fails.
      } finally {
        isClosed = true;
      }
    }
  }

  /**
   * @return true as long as the parser is not closed (neither by reaching the
   * end of the source nor by a terminating operation)
   */
  public boolean notEOF() {
    return !isClosed;
  }

  /**
   * @return true if the parser is closed, either because the end of the source
   * was reached or because of a terminating operation
   */
  public boolean isEOF() {
    return isClosed;
  }

  /**
   * @return true if the parser is open and the current line contains more than
   * white space
   */
  public boolean notEmpty() {
    return !isClosed && !line.trim().isEmpty();
  }

  /**
   * @return true if the parser is open and the current line contains white
   * space only
   */
  public boolean isEmpty() {
    return !isClosed && line.trim().isEmpty();
  }

  /**
   * Read one line and count it. Closes the parser when the end of the source
   * is reached.
   *
   * @param err context appended to the message if the parser is closed
   * @return the line, or {@code null} at the end of the source
   * @throws IOException if the parser is already closed or reading fails
   */
  private String readLine(String err) throws IOException {
    if (isClosed) {
      throw new IOException(name + ": already closed. Use autoClose(false) to allow for successive reads. Do "
          + "not forget to close the stream at the end.\n" + err);
    }
    String l = r.readLine();
    if (l == null) {
      close();
    }
    lineno++;
    return l;
  }

  @Override
  public String toString() {
    return lineno + ": '" + line + "'";
  }

  /**
   * Fast token extract. tokens are separated by any number of white spaces.
   * Tokens cannot contain any white spaces, as in csv.
   *
   * @param line the line to get the token from
   * @param col the column, starts with 0.
   * @return the token at column 'col', or {@code null} if the line has fewer
   * tokens
   * @throws IllegalArgumentException if {@code col < 0}
   */
  static String extractWSToken(String line, int col) {
    return extractToken(line, col, false);
  }

  /**
   * Fast token extract. Tokens are separated by any number of commas, hence
   * empty fields are not counted. Tokens are returned as is (not trimmed);
   * quoting is not supported.
   *
   * @param line the line to get the token from
   * @param col the column, starts with 0.
   * @return the token at column 'col', or {@code null} if the line has fewer
   * tokens
   * @throws IllegalArgumentException if {@code col < 0}
   */
  static String extractCSVToken(String line, int col) {
    return extractToken(line, col, true);
  }

  /**
   * Scans the line token by token (a token being a maximal run of non
   * separator characters) and returns the one at 'col'.
   */
  private static String extractToken(String line, int col, boolean commaSeparated) {
    if (col < 0) {
      throw new IllegalArgumentException("col argument < 0.");
    }
    final int end = line.length();
    int pos = 0;
    for (int token = 0;; token++) {
      while (pos < end && isSeparator(line.charAt(pos), commaSeparated)) {
        pos++;
      }
      if (pos == end) {
        // only separators (or nothing) left: there is no such column.
        return null;
      }
      int start = pos;
      while (pos < end && !isSeparator(line.charAt(pos), commaSeparated)) {
        pos++;
      }
      if (token == col) {
        return line.substring(start, pos);
      }
    }
  }

  /** Tells if a character separates tokens in comma or white space mode. */
  private static boolean isSeparator(char c, boolean commaSeparated) {
    return commaSeparated ? c == ',' : Character.isWhitespace(c);
  }

  //////////////
  /**
   * The tokens of a line. The range operations and {@link #reverse()} modify
   * this instance and return it to allow for chaining.
   */
  public static class Tokens {

    String[] tok;
    static final String[] EMPTY = new String[]{};

    /**
     * Splits the trimmed line. A {@code null} line has no tokens. Note that an
     * empty line results in one empty token (see {@link String#split(String)}).
     */
    private Tokens(String line, String regex) {
      tok = (line == null) ? EMPTY : line.trim().split(regex);
    }

    /**
     * @return the number of tokens
     */
    public int count() {
      return tok.length;
    }

    /**
     * Find the first index of val. in the token array.
     *
     * @param val the string to match (equals)
     * @return the index of val, or -1 if not existent.
     */
    public int indexOf(String val) {
      for (int i = 0; i < tok.length; i++) {
        if (tok[i] == null && val == null) {
          return i;
        }
        if (tok[i] != null && tok[i].equals(val)) {
          return i;
        }
      }
      return -1;
    }

    /**
     * Drop all tokens before index i.
     *
     * @param i the index of the first token to keep
     * @return this instance
     * @deprecated replace with rangeFrom()
     */
    @Deprecated
    public Tokens fromIndex(int i) {
      return rangeFrom(i);
    }

    /**
     * Drop all tokens before index 'from'.
     *
     * @param from the index of the first token to keep
     * @return this instance
     * @throws IllegalArgumentException if 'from' is not a valid token index
     */
    public Tokens rangeFrom(int from) {
      if (from < 0 || from > tok.length - 1) {
        throw new IllegalArgumentException("invalid begin index:" + from);
      }
      if (from == 0) {
        return this;
      }
      tok = Arrays.copyOfRange(tok, from, tok.length);
      return this;
    }

    /**
     * Drop tokens from the end.
     *
     * @param i must be zero or negative, -n drops the last n tokens
     * @return this instance
     * @deprecated replace with rangeTo()
     */
    @Deprecated
    public Tokens toIndex(int i) {
      return rangeTo(i);
    }

    /**
     * Drop tokens from the end; at least one token has to remain.
     *
     * @param to must be zero or negative (starts from the end of the string, a
     * la python), -n drops the last n tokens
     * @return this instance
     * @throws IllegalArgumentException if 'to' is positive or would leave no
     * token
     */
    public Tokens rangeTo(int to) {
      if (to > 0 || -to > tok.length - 1) {
        throw new IllegalArgumentException("invalid end index, (must be negative): " + to);
      }
      if (to == 0) {
        return this;
      }
      tok = Arrays.copyOfRange(tok, 0, tok.length + to);
      return this;
    }

    /**
     * Gets all tokens in the range 'from -> to'
     *
     * @param from index (inclusive)
     * @param to index (inclusive), a single token is selected if it equals
     * 'from'
     * @return this instance
     * @throws IllegalArgumentException if an index is not a valid token index,
     * or 'to' is less than 'from'
     */
    public Tokens range(int from, int to) {
      if (from < 0 || from > tok.length - 1) {
        throw new IllegalArgumentException("invalid 'from' index: " + from);
      }
      if (to < 0 || to > tok.length - 1) {
        throw new IllegalArgumentException("invalid 'to' index: " + to);
      }
      // both ends are inclusive, hence from == to is a valid one token range.
      if (to < from) {
        throw new IllegalArgumentException("invalid 'from/to' index :" + from + " " + to);
      }
      if (from == 0 && to == tok.length - 1) {
        return this;
      }
      tok = Arrays.copyOfRange(tok, from, to + 1);
      return this;
    }

    /**
     * Reverse the order of the tokens.
     *
     * @return this instance
     */
    public Tokens reverse() {
      int len = tok.length;
      for (int i = 0; i < len / 2; i++) {
        String tmp = tok[i];
        tok[i] = tok[len - i - 1];
        tok[len - i - 1] = tmp;
      }
      return this;
    }

    private DoubleStream dstream() {
      return Arrays.stream(tok).mapToDouble(Double::parseDouble);
    }

    private IntStream istream() {
      return Arrays.stream(tok).mapToInt(Integer::parseInt);
    }

    private List<String> tokenlist() {
      return Arrays.asList(tok);
    }

    /**
     * @param i the token index
     * @return the token at index i parsed as int
     */
    public int intAt(int i) {
      return Integer.parseInt(tok[i]);
    }

    /**
     * @param i the token index
     * @return the token at index i parsed as double
     */
    public double doubleAt(int i) {
      return Double.parseDouble(tok[i]);
    }

    /**
     * @return the sum of all tokens parsed as double
     */
    public double sum() {
      return dstream().sum();
    }

    /**
     * @return the average of all tokens parsed as double
     * @throws java.util.NoSuchElementException if there are no tokens
     */
    public double average() {
      return dstream().average().getAsDouble();
    }

    /**
     * @return the minimum of all tokens parsed as double
     * @throws java.util.NoSuchElementException if there are no tokens
     */
    public double min() {
      return dstream().min().getAsDouble();
    }

    /**
     * @return the maximum of all tokens parsed as double
     * @throws java.util.NoSuchElementException if there are no tokens
     */
    public double max() {
      return dstream().max().getAsDouble();
    }

    /**
     * @return all tokens parsed as double
     * @throws NumberFormatException if a token is not a number
     */
    public double[] asDoubleArray() {
      return dstream().toArray();
    }

    /**
     * This method can handle null and empty tokens
     *
     * @return all tokens as Double, null for a null or empty token
     */
    public List<Double> asDoubleList() {
      return tokenlist().stream()
          .map(v -> (v == null || v.isEmpty()) ? null : Double.valueOf(v))
          .collect(Collectors.toList());
    }

    /**
     * @return all tokens parsed as int
     * @throws NumberFormatException if a token is not an integer
     */
    public int[] asIntArray() {
      return istream().toArray();
    }

    /**
     * This method can handle null and empty tokens
     *
     * @return all tokens as Integer, null for a null or empty token
     */
    public List<Integer> asIntList() {
      return tokenlist().stream()
          .map(v -> (v == null || v.isEmpty()) ? null : Integer.valueOf(v))
          .collect(Collectors.toList());
    }

    /**
     * @return the token array backing this instance (not a copy)
     */
    public String[] asStringArray() {
      return tok;
    }

    /**
     * @return a fixed size list view of the token array backing this instance
     */
    public List<String> asStringList() {
      return tokenlist();
    }
  }
}