USGS_Data.java [src/java/cfa] Revision: 1b69320646d4eeb4b67833c13ef5e0bb175788f9 Date: Mon Sep 30 15:04:28 MDT 2013
package cfa;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebWindow;
import com.gargoylesoftware.htmlunit.WebWindowEvent;
import com.gargoylesoftware.htmlunit.WebWindowListener;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextArea;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
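/**
* Downloads and parses U.S. Geological Survey (USGS) web reports for a single station:
* daily flow data, peak flow data and water quality results.
* <p>
* Last Updated: 2-November-2012
* @author Tyler Wible
* @since 21-June-2012
*/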
public class USGS_Data {
/**
 * Opens a web connection to USGS and returns the contents of a search for all flow data for the
 * specific station and date range. The request asks for the tab-delimited "rdb" format
 * ({@code format=rdb}) with the {@code cb_00060=on} option.
 * @param stationID the USGS station ID for the current station
 * @param beginDate the user specified begin date for the station (yyyy-mm-dd format)
 * @param endDate the user specified end date for the station (yyyy-mm-dd format)
 * @return an ArrayList<String> containing every line of the response, in the order received
 * @throws IOException if the URL is malformed, the connection cannot be opened, or reading fails
 */
public ArrayList<String> DownloadFlowWebpage(String stationID, String beginDate, String endDate) throws IOException {
    //Specify flow website from inputs
    String flowWebsite = "http://waterdata.usgs.gov/nwis/dv?cb_00060=on&format=rdb&begin_date=" +
            beginDate + "&end_date=" + endDate + "&site_no=" + stationID + "&referred_module=sw";

    //Open the provided website
    URL webpage = new URL(flowWebsite);
    URLConnection yc = webpage.openConnection();
    BufferedReader in = new BufferedReader(new InputStreamReader(yc.getInputStream()));

    //Read all of the webpage out into an ArrayList<String>
    ArrayList<String> pageData = new ArrayList<String>();
    try {
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            pageData.add(inputLine);
        }
    } finally {
        //Always release the connection's stream, even when reading fails part way through
        in.close();
    }
    return pageData;
}
/**
 * Downloads the flow webpage for a station and extracts its date/flow pairs.
 * A line is used only when it has at least four tab-separated fields, its first field is "USGS",
 * and its fourth field is not one of the markers "Ice", "Ssn", "Dis", "Rat", "Eqp", "***"
 * (compared ignoring case) and is not empty.
 * @param stationID the USGS station ID for the current station
 * @param beginDate the user specified begin date for the station (yyyy-mm-dd format)
 * @param endDate the user specified end date for the station (yyyy-mm-dd format)
 * @return a String[][] containing column1 = date (third field of the line), column2 = flowValue (fourth field)
 * @throws IOException if the flow webpage cannot be downloaded
 */
public String[][] USGS_read_FDC(String stationID, String beginDate, String endDate) throws IOException{
    ArrayList<String> rawLines = DownloadFlowWebpage(stationID, beginDate, endDate);

    //Collect {date, value} pairs for every usable data row
    ArrayList<String[]> keptRows = new ArrayList<String[]>();
    for (String line : rawLines) {
        String[] cols = line.split("\t");
        if (cols.length < 4 || !"USGS".equals(cols[0])) {
            continue;
        }

        //cols[1] = StationID, cols[2] = Date, cols[3] = FlowValue
        String flowValue = cols[3];
        boolean unusable = flowValue.equalsIgnoreCase("Ice")
                || flowValue.equalsIgnoreCase("Ssn")
                || flowValue.equalsIgnoreCase("Dis")
                || flowValue.equalsIgnoreCase("Rat")
                || flowValue.equalsIgnoreCase("Eqp")
                || flowValue.equalsIgnoreCase("***")
                || flowValue.equalsIgnoreCase("");
        if (unusable) {
            continue;
        }
        keptRows.add(new String[] {cols[2], flowValue});
    }

    //Hand the pairs back as a two column array (column1 = date, column2 = value)
    return keptRows.toArray(new String[keptRows.size()][]);
}
/**
 * Opens a web connection to USGS and returns the contents of a search for all peak flow data for
 * the specific station. The request asks for the tab-delimited "rdb" format ({@code format=rdb}).
 * @param stationID the USGS station ID for the current station
 * @return an ArrayList<String> containing every line of the response, in the order received
 * @throws IOException if the URL is malformed, the connection cannot be opened, or reading fails
 */
public ArrayList<String> DownloadPeakFlowWebpage(String stationID) throws IOException {
    //Specify peak flow website from inputs
    String peakWebsite = "http://nwis.waterdata.usgs.gov/nwis/peak?site_no=" + stationID + "&agency_cd=USGS&format=rdb";

    //Open the provided website
    URL webpage = new URL(peakWebsite);
    URLConnection yc = webpage.openConnection();
    BufferedReader in = new BufferedReader(new InputStreamReader(yc.getInputStream()));

    //Read all of the webpage out into an ArrayList<String>
    ArrayList<String> pageData = new ArrayList<String>();
    try {
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            pageData.add(inputLine);
        }
    } finally {
        //The reader was previously never closed; release it on success and on failure
        in.close();
    }
    return pageData;
}
/**
 * Downloads the peak flow webpage for a station and extracts its date/peak-flow pairs.
 * A line is used only when it has at least five tab-separated fields, its first field is "USGS",
 * and its fifth field (the peak flow) is not one of the markers "Ice", "Ssn", "Dis", "Rat",
 * "Eqp", "***" (compared ignoring case) and is not empty.
 * @param stationID the USGS station ID for the current station
 * @return an ArrayList<String> containing the peak flow results tab delimited (date \t peakFlowValue)
 * @throws IOException if the peak flow webpage cannot be downloaded
 */
public ArrayList<String> getUSGSPeakData(String stationID) throws IOException{
    //Get peak flow data
    ArrayList<String> peakWebPage = DownloadPeakFlowWebpage(stationID);

    //Loop through and pull out the desired data
    ArrayList<String> textData = new ArrayList<String>();
    for (String row : peakWebPage) {
        String[] f = row.split("\t");
        if (f.length < 5 || !"USGS".equals(f[0])) {
            continue;
        }

        //Rows are laid out as "USGS StationNumber Date Time FlowValue"
        //f[2] = date
        //f[4] = peak flow (cfs)
        String peakFlow = f[4];

        //Every marker is tested against the peak flow field; the "Eqp" test used to read f[3]
        //(the time field) by mistake, which let "Eqp" rows through as if they were flow values
        boolean flagged = peakFlow.equalsIgnoreCase("Ice")
                || peakFlow.equalsIgnoreCase("Ssn")
                || peakFlow.equalsIgnoreCase("Dis")
                || peakFlow.equalsIgnoreCase("Rat")
                || peakFlow.equalsIgnoreCase("Eqp")
                || peakFlow.equalsIgnoreCase("***")
                || peakFlow.equalsIgnoreCase("");
        if (!flagged) {
            textData.add(f[2] + "\t" + peakFlow);
        }
    }
    return textData;
}
/**
 * Checks whether the webpage contains the search keyword or not, usually used to find a webpage
 * error. Reading stops at the first line that contains the keyword, so any later lines are left
 * unread in the reader; when the keyword is absent the reader is consumed to its end.
 * @param in the BufferedReader for the webpage
 * @param keyword the String keyword that is looked for in the webpage
 * @return true if some line of the webpage contains the keyword, false otherwise
 * @throws IOException if reading from the BufferedReader fails
 */
public boolean getWQPage(BufferedReader in, String keyword) throws IOException{
    for (String line = in.readLine(); line != null; line = in.readLine()) {
        if (line.contains(keyword)) {
            return true;
        }
    }
    return false;
}
/**
 * Opens a web connection to USGS and returns the contents of a search for all water quality data
 * for the specific station.
 * <p>
 * A response in which any line contains the text "Network Issues" is treated as a failed download:
 * it is discarded and the request is repeated after a one second pause, up to 15 attempts in total.
 * The page that is returned is the same page that was checked, each attempt downloads the page
 * only once, and no pause is taken after a successful download.
 * @param stationID the USGS station ID for the current station
 * @return an ArrayList<String> containing every line of the response, or an empty list when every
 * attempt reported "Network Issues"
 * @throws IOException if the URL is malformed, the connection cannot be opened, or reading fails
 * @throws InterruptedException if the thread is interrupted while pausing between attempts
 */
public ArrayList<String> DownloadWQwebpage(String stationID) throws IOException, InterruptedException {
    //Specify water quality website from inputs
    String WQWebsite = "http://nwis.waterdata.usgs.gov/usa/nwis/qwdata/?site_no=" + stationID + "&agency_cd=USGS&inventory_output=0&rdb_inventory_output=value&TZoutput=0&pm_cd_compare=Greater%20than&radio_parm_cds=all_parm_cds&qw_attributes=0&format=rdb&qw_sample_wide=0&rdb_qw_attributes=0&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list";
    URL webpage = new URL(WQWebsite);

    //Arbitrary limit (change as needed): after this many failed downloads the page likely
    //will not load or the station search did not work
    final int maxAttempts = 15;

    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        ArrayList<String> pageData = new ArrayList<String>();
        boolean networkIssues = false;

        URLConnection yc = webpage.openConnection();
        BufferedReader in = new BufferedReader(new InputStreamReader(yc.getInputStream()));
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                if (inputLine.contains("Network Issues")) {
                    //Error page: no point in reading the rest of it
                    networkIssues = true;
                    break;
                }
                pageData.add(inputLine);
            }
        } finally {
            in.close();
        }

        if (!networkIssues) {
            return pageData;
        }
        if (attempt < maxAttempts) {
            Thread.sleep(1000); // wait 1 second before asking again
        }
    }

    //Every attempt failed, report this with an empty result
    return new ArrayList<String>();
}
/**
 * Opens a web connection to USGS and returns the header plus the water quality rows of one test
 * code for the specific station.
 * <p>
 * Lines whose first tab-separated field is not "USGS" (ignoring case) are kept as header lines.
 * Lines that do start with "USGS" are kept only when their thirteenth field (index 12, the water
 * quality test code) equals {@code wqTest} ignoring case; "USGS" lines with fewer than thirteen
 * fields are skipped instead of causing an ArrayIndexOutOfBoundsException.
 * <p>
 * A response in which any line contains the text "Network Issues" is treated as a failed download:
 * it is discarded and the request is repeated after a one second pause, up to 15 attempts in total.
 * The page that is returned is the same page that was checked, and no pause is taken after a
 * successful download.
 * @param stationID the USGS station ID for the current station
 * @param wqTest the 5 digit USGS water quality (WQ) test code that the user has requested for download
 * @return an ArrayList containing the USGS provided header for the text as well as all the WQ data
 * for the provided wqTest code, or an empty list when every attempt reported "Network Issues"
 * @throws IOException if the URL is malformed, the connection cannot be opened, or reading fails
 * @throws InterruptedException if the thread is interrupted while pausing between attempts
 */
public ArrayList<String> DownloadPartialWQwebpage(String stationID, String wqTest) throws IOException, InterruptedException {
    //Specify water quality website from inputs
    String WQWebsite = "http://nwis.waterdata.usgs.gov/usa/nwis/qwdata/?site_no=" + stationID + "&agency_cd=USGS&inventory_output=0&rdb_inventory_output=value&TZoutput=0&pm_cd_compare=Greater%20than&radio_parm_cds=all_parm_cds&qw_attributes=0&format=rdb&qw_sample_wide=0&rdb_qw_attributes=0&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list";
    URL webpage = new URL(WQWebsite);

    //Arbitrary limit (change as needed): after this many failed downloads the page likely
    //will not load or the station search did not work
    final int maxAttempts = 15;

    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        ArrayList<String> pageData = new ArrayList<String>();
        boolean networkIssues = false;

        URLConnection yc = webpage.openConnection();
        BufferedReader in = new BufferedReader(new InputStreamReader(yc.getInputStream()));
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                if (inputLine.contains("Network Issues")) {
                    //Error page: no point in reading the rest of it
                    networkIssues = true;
                    break;
                }

                //Delimit the row based on tabs
                String[] f = inputLine.split("\t");
                if (!f[0].equalsIgnoreCase("USGS")) {
                    //Not a data row, so it is part of the header that should be kept
                    pageData.add(inputLine);
                } else if (f.length > 12 && f[12].equalsIgnoreCase(wqTest)) {
                    //f[12] = water quality test code; keep the row only when it matches
                    pageData.add(inputLine);
                }
            }
        } finally {
            in.close();
        }

        if (!networkIssues) {
            return pageData;
        }
        if (attempt < maxAttempts) {
            Thread.sleep(1000); // wait 1 second before asking again
        }
    }

    //Every attempt failed, report this with an empty result
    return new ArrayList<String>();
}
/**
 * Uses HtmlUnit to drive the USGS website like a browser: loads the station's page, follows its
 * water quality data link, fills in the search form for one test code and returns the lines of the
 * resulting text page.
 * Note, this function uses HtmlUnit because the above DownloadWQwebpage stopped working recently
 * <p>
 * Failures are reported through the return value rather than by exception: the returned list then
 * starts with a message beginning "Error: USGS_readWQData_000N" (N = 1 to 4, see the inline comments).
 * <p>
 * NOTE(review): the results of getElementById and submitList.get(0) are used without a null/empty
 * check, and the WebClient is not closed anywhere in this method - confirm whether that is intended.
 * @param stationID the USGS station ID for the current station
 * @param wqTestCode the water quality test code typed into the form's parameter code field
 * @return an ArrayList<String> containing the result page split on "\n", or an error message as described above
 * @throws IOException declared by the HtmlUnit calls made outside the try blocks (form submission)
 * @throws InterruptedException declared in the signature; no statement in this method visibly throws it
 */
public ArrayList<String> DownloadWQwebpage_HtmlUnit(String stationID, String wqTestCode) throws IOException, InterruptedException {
//Specify the station's main USGS webpage from inputs
String WQWebsite = "http://waterdata.usgs.gov/nwis/nwisman/?site_no=" + stationID + "&agency_cd=USGS";
//Create a WebClient that does not throw on script errors and that records every window it opens
//(the recorded "windows" list is filled by the listener but not read again in this method)
final LinkedList<WebWindow> windows = new LinkedList<WebWindow>();
WebClient webClient = new WebClient();
webClient.setThrowExceptionOnScriptError(false);
webClient.addWebWindowListener(new WebWindowListener(){
public void webWindowClosed(WebWindowEvent event){
}
public void webWindowContentChanged(WebWindowEvent event){
}
public void webWindowOpened(WebWindowEvent event){
windows.add(event.getWebWindow());
}
});
//Get webpage; any failure is printed and leaves mainPage null, which is handled just below
HtmlPage mainPage = null;
try {
mainPage = webClient.getPage(WQWebsite);
}catch (FailingHttpStatusCodeException e) {
e.printStackTrace();
}catch (MalformedURLException e) {
e.printStackTrace();
}catch (IOException e) {
e.printStackTrace();
}
//Error 0001: the station's main page could not be retrieved
if(mainPage == null){
ArrayList<String> errorMessage = new ArrayList<String>();
errorMessage.add("Error: USGS_readWQData_0001\n Error retriving webpage: " + WQWebsite + "");
return errorMessage;
}
//Follow the link to the station's water quality data page; exactly one matching link is expected
HtmlPage wqPage = null;
List<?> linkList1 = (List<?>) mainPage.getByXPath("//a[@href='/nwis/qwdata/?site_no=" + stationID + "']");
try{
if(linkList1.size() == 1){
HtmlAnchor selectAll = (HtmlAnchor) linkList1.get(0);
selectAll.focus();
wqPage = selectAll.click();
}else{
//Error 0002: zero or several water quality links were found on the main page
ArrayList<String> errorMessage = new ArrayList<String>();
errorMessage.add("Error: USGS_readWQData_0002\n There are no " + wqTestCode + " water quality tests for station: " + stationID);
return errorMessage;
}
}catch(IOException e){
//Error 0003: clicking the link failed; the exception text is returned as a second list entry
ArrayList<String> errorMessage = new ArrayList<String>();
errorMessage.add("Error: USGS_readWQData_0003\n");
errorMessage.add(e.toString());
// System.out.println(e.toString());
return errorMessage;
}
//NOTE(review): this writes the whole water quality page as XML to standard output - looks like leftover debugging, confirm
System.out.println(wqPage.asXml());
//Enter parameter code for data search
HtmlTextArea wqCodeSearch = (HtmlTextArea) wqPage.getElementById("radio_multiple_parm_cds");
//Set focus on this element to allow the webpage's javascript to check the radio button corresponding to this element
wqCodeSearch.focus();
wqCodeSearch.setText(wqTestCode);
//Change the data format to a better format
HtmlSelect dataFormat = (HtmlSelect) wqPage.getElementById("qw_sample_wide");
//Set focus on this element to allow the webpage's javascript to check the radio button corresponding to this element
dataFormat.focus();
//NOTE(review): presumably "One result per row" must match what HtmlSelect.setSelectedAttribute expects for this option - verify against the HtmlUnit API
dataFormat.setSelectedAttribute("One result per row", true);
//Change the download to display in browser
HtmlSelect downloadType = (HtmlSelect) wqPage.getElementById("rdb_compr_id");
downloadType.setSelectedAttribute("Display in browser", true);
//Get the result page by clicking the first input whose value is 'Submit'
TextPage resultPage = null;
List<?> submitList = (List<?>) wqPage.getByXPath("//input[@value='Submit']");
HtmlSubmitInput submitButton = (HtmlSubmitInput) submitList.get(0);
resultPage = submitButton.click();
//Extract data from result page
String resultPageContents = resultPage.getContent();
//Error 0004: the result page reports that the requested test code is not valid
if(resultPageContents.contains("No valid parameter codes")){
ArrayList<String> errorMessage = new ArrayList<String>();
errorMessage.add("Error: USGS_readWQData_0004\n There are no " + wqTestCode + " water quality tests for station: " + stationID);
return errorMessage;
}
//Return the result page one line per list entry
String[] resultPageRows = resultPageContents.split("\n");
ArrayList<String> pageData = new ArrayList<String>();
for(int i=0; i<resultPageRows.length; i++){
pageData.add(resultPageRows[i]);
}
return pageData;
}
/**
 * Reduces all water quality data to just that of the requested parameter within the date range.
 * The first row of allData is always ignored (it is expected to hold the station name). Dates are
 * compared as plain strings, inclusive at both ends, and the test code is compared ignoring case.
 * @param allData all water quality data for the earlier provided date range and station ID (column1 = date, column2 = wqTestcode, column3 = value)
 * @param wqTestCode the requested water quality parameter
 * @param beginDate the user defined begin date for data search
 * @param endDate the user defined end date for data search
 * @return a String[][] with one row per matching record, in input order (column1 = date, column2 = value)
 * @throws IOException declared in the signature; nothing in this method performs I/O
 */
public String[][] minimizeUSGSWQdata(String[][] allData, String wqTestCode, String beginDate, String endDate) throws IOException{
    List<String[]> matches = new ArrayList<String[]>();

    //Start at index 1 to skip the first row containing the station name
    for (int row = 1; row < allData.length; row++) {
        String[] record = allData[row];
        if (!record[1].equalsIgnoreCase(wqTestCode)) {
            continue;
        }
        if (record[0].compareTo(beginDate) < 0) {
            continue;
        }
        if (record[0].compareTo(endDate) > 0) {
            continue;
        }
        //record[0] = date, record[2] = WQ test result value
        matches.add(new String[] {record[0], record[2]});
    }

    return matches.toArray(new String[matches.size()][]);
}
/**
 * Appends array2 after array1 in a newly allocated two column array.
 * Intended for the output of "minimizeUSGSWQdata" (dates in the first column, values in the
 * second); only the first two columns of every input row are copied, and the input arrays
 * themselves are not modified or shared with the result.
 * @param array1 first String[][] array to be combined column1 = dates, column2 = values
 * @param array2 second String[][] array to be combined column1 = dates, column2 = values
 * @return a combined array holding the rows of array1 followed by the rows of array2, with length = (array1.length + array2.length)
 */
public String[][] mergeMinimizedWQdata(String[][] array1, String[][] array2){
    String[][] merged = new String[array1.length + array2.length][2];
    int next = 0;

    //Rows of the first array come first...
    for (String[] pair : array1) {
        merged[next][0] = pair[0];
        merged[next][1] = pair[1];
        next++;
    }
    //...followed by the rows of the second array
    for (String[] pair : array2) {
        merged[next][0] = pair[0];
        merged[next][1] = pair[1];
        next++;
    }
    return merged;
}
/**
 * Downloads the water quality webpage for a station and extracts its date/test-code/value triples.
 * A line is used only when it has at least fifteen tab-separated fields, its first field is
 * "USGS", and neither its test code (field index 12) nor its result value (field index 14) is empty.
 * @param stationID the USGS station ID for the current station
 * @return a String[][] containing column1 = date(yyyy-mm-dd), column2 = water quality test code, column3 = water quality test value
 * @throws IOException if the water quality webpage cannot be downloaded
 * @throws InterruptedException if the download is interrupted while it waits to retry
 */
public String[][] USGS_read_LDC(String stationID) throws IOException, InterruptedException{
    ArrayList<String> rawLines = DownloadWQwebpage(stationID);

    //Collect {date, test code, value} triples for every usable data row
    ArrayList<String[]> samples = new ArrayList<String[]>();
    for (String line : rawLines) {
        String[] cols = line.split("\t");
        if (cols.length < 15 || !cols[0].equals("USGS")) {
            continue;
        }

        //Column layout of a data row: "agency_cd site_no sample_dt sample_tm sample_end_dt
        // sample_end_tm sample_start_time_datum_cd tm_datum_rlbty_cd coll_ent_cd medium_cd
        // tu_id body_part_id parm_cd remark_cd result_va"
        //cols[1] = stationID, cols[2] = date
        String testCode = cols[12];
        String testValue = cols[14];
        if (testCode.equals("") || testValue.equals("")) {
            continue;
        }
        samples.add(new String[] {cols[2], testCode, testValue});
    }

    //Hand the triples back as a three column array
    return samples.toArray(new String[samples.size()][]);
}
}