// USGS_Data.java [src/java/cfa] Revision: 1b69320646d4eeb4b67833c13ef5e0bb175788f9  Date: Mon Sep 30 15:04:28 MDT 2013
package cfa;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebWindow;
import com.gargoylesoftware.htmlunit.WebWindowEvent;
import com.gargoylesoftware.htmlunit.WebWindowListener;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextArea;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * Downloads tab-delimited (format=rdb) daily flow, peak flow and water quality listings from the
 * USGS NWIS web pages and reduces them to the date/value tables used by callers.
 * Last Updated: 2-November-2012
 * @author Tyler Wible
 * @since 21-June-2012
 */
public class USGS_Data {
    /** Text whose presence marks a water quality response as an error page instead of data. */
    private static final String WQ_ERROR_KEYWORD = "Network Issues";
    /** Number of additional download attempts made after a water quality error page before giving up. */
    private static final int MAX_WQ_RETRIES = 15;
    /** Pause between water quality download attempts, in milliseconds. */
    private static final long WQ_RETRY_DELAY_MS = 1000L;
    /** Entries that can appear in a value column when there is no usable measurement (compared ignoring case). */
    private static final String[] NON_VALUE_CODES = {"Ice", "Ssn", "Dis", "Rat", "Eqp", "***", ""};

    /**
     * Opens a web connection to USGS and returns the contents of a search for all flow data for the specific station and date range
     * @param stationID  the USGS station ID for the current station
     * @param beginDate  the user specified begin date for the station (yyyy-mm-dd format)
     * @param endDate  the user specified end date for the station (yyyy-mm-dd format)
     * @return an {@code ArrayList<String>} holding every line of the response, in order
     * @throws IOException if the page cannot be opened or read
     */
    public ArrayList<String> DownloadFlowWebpage(String stationID, String beginDate, String endDate) throws IOException {
        //Specify flow website from inputs
        String flowWebsite = "http://waterdata.usgs.gov/nwis/dv?cb_00060=on&format=rdb&begin_date=" + 
                beginDate + "&end_date=" + endDate + "&site_no=" + stationID + "&referred_module=sw";

        return readAllLines(flowWebsite);
    }
    /**
     * Get the flow webpage and loop through and pull out the flow data for the current station.
     * Only rows that start with "USGS", have at least four tab separated columns and whose
     * fourth column is not one of the non-value codes (Ice, Ssn, Dis, Rat, Eqp, ***, blank) are kept.
     * @param stationID  the USGS station ID for the current station
     * @param beginDate  the user specified begin date for the station (yyyy-mm-dd format)
     * @param endDate  the user specified end date for the station (yyyy-mm-dd format)
     * @return  a String[][] containing column1 = date(yyyy-mm-dd), column2 = flowValue
     * @throws IOException if the page cannot be opened or read
     */
    public String[][] USGS_read_FDC(String stationID, String beginDate, String endDate) throws IOException{
        //Get the webpage of data for the USGS flow station
        ArrayList<String> webpageAll = DownloadFlowWebpage(stationID, beginDate, endDate);

        List<String[]> rows = new ArrayList<String[]>();
        for(String line : webpageAll){
            String[] f = line.split("\t");
            //f[1] = StationID
            //f[2] = Date
            //f[3] = FlowValue
            if ((f.length >= 4) && "USGS".equals(f[0]) && !isNonValueCode(f[3])) {
                rows.add(new String[]{f[2], f[3]});
            }
        }

        return rows.toArray(new String[rows.size()][]);
    }
    /**
     * Opens a web connection to USGS and returns the contents of a search for all peak flow data for the specific station
     * @param stationID  the USGS station ID for the current station
     * @return an {@code ArrayList<String>} holding every line of the response, in order
     * @throws IOException if the page cannot be opened or read
     */
    public ArrayList<String> DownloadPeakFlowWebpage(String stationID) throws IOException {
        //Specify flow website from inputs
        String peakWebsite = "http://nwis.waterdata.usgs.gov/nwis/peak?site_no=" + stationID + "&agency_cd=USGS&format=rdb";

        return readAllLines(peakWebsite);
    }
    /**
     * Get the peak flow webpage and loop through and pull out the peak flow data for the current station.
     * Only rows that start with "USGS", have at least five tab separated columns and whose
     * fifth column (the peak flow) is not one of the non-value codes (Ice, Ssn, Dis, Rat, Eqp, ***, blank) are kept.
     * @param stationID  the USGS station ID for the current station
     * @return  an {@code ArrayList<String>} containing the peak flow results tab delimited (date \t peakFlowValue)
     * @throws IOException if the page cannot be opened or read
     */
    public ArrayList<String> getUSGSPeakData(String stationID) throws IOException{
        //Get peak flow data
        ArrayList<String> peakWebPage = DownloadPeakFlowWebpage(stationID);

        //Loop through and pull out the desired data
        ArrayList<String> textData = new ArrayList<String>();
        for(String line : peakWebPage){
            String[] f = line.split("\t");
            //f[2] = date
            //f[4] = peak flow (cfs)
            //Every non-value check is made against the flow column f[4]
            if ((f.length >= 5) && "USGS".equals(f[0]) && !isNonValueCode(f[4])) {
                textData.add(f[2] + "\t" + f[4]);
            }
        }

        return textData;
    }
    /**
     * Checks if the webpage contains the search keyword or not, usually used to find a webpage error.
     * Reading stops at the first matching line; the reader is left open for the caller to close.
     * @param in  the BufferedReader for the webpage
     * @param keyword  the String keyword that is looked for in the webpage
     * @return  returns true if the webpage contains the keyword, false otherwise
     * @throws IOException if reading from {@code in} fails
     */
    public boolean getWQPage(BufferedReader in, String keyword) throws IOException{
        String inputLine;
        while((inputLine = in.readLine()) != null) {
            if(inputLine.contains(keyword)){
                return true;
            }
        }
        return false;
    }
    /**
     * Opens a web connection to USGS and returns the contents of a search for all water quality data for the specific station.
     * A response containing "Network Issues" is treated as an error page: the download is repeated after a
     * one second pause, and an empty list is returned once the retries are used up.
     * @param stationID  the USGS station ID for the current station
     * @return an {@code ArrayList<String>} holding every line of the first error-free response, or an empty list if none was obtained
     * @throws IOException if the page cannot be opened or read
     * @throws InterruptedException if the thread is interrupted while waiting between attempts
     */
    public ArrayList<String> DownloadWQwebpage(String stationID) throws IOException, InterruptedException {
        return downloadWQLines(stationID);
    }
    /**
     * Opens a web connection to USGS and returns the water quality data for the specific station, reduced to one test code.
     * Lines that do not start with "USGS" (the header) are always kept; "USGS" rows are kept only when their
     * 13th tab separated column equals {@code wqTest} (ignoring case). Rows too short to have that column are dropped.
     * @param stationID  the USGS station ID for the current station
     * @param wqTest  the 5 digit USGS water quality (WQ) test code that the user has requested for download
     * @return  an ArrayList containing the USGS provided header for the text as well as all the WQ data for the provided wqTest code,
     *          or an empty list if no error-free response was obtained
     * @throws IOException if the page cannot be opened or read
     * @throws InterruptedException if the thread is interrupted while waiting between attempts
     */
    public ArrayList<String> DownloadPartialWQwebpage(String stationID, String wqTest) throws IOException, InterruptedException {
        ArrayList<String> pageData = new ArrayList<String>();
        for(String line : downloadWQLines(stationID)){
            //Delimit the row based on tabs
            String[] f = line.split("\t");
            if(!f[0].equalsIgnoreCase("USGS")){
                //Part of the header, which is kept as is
                pageData.add(line);
            }else if((f.length > 12) && f[12].equalsIgnoreCase(wqTest)){
                //f[12] = water quality test code
                pageData.add(line);
            }
        }
        return pageData;
    }
    /**
     * Opens a web connection to USGS and returns the contents of a search for all water quality data for the specific station.
     * Note, this function uses HtmlUnit to follow the station page's water quality link and submit its search form.
     * Failures to load a page or to find data are reported as a list whose first entry starts with "Error: USGS_readWQData_".
     * @param stationID  the USGS station ID for the current station
     * @param wqTestCode  the USGS water quality test code to search for
     * @return an {@code ArrayList<String>} containing the rows of the result page, or an error message list
     * @throws IOException if the search form does not have the expected fields or submitting it fails
     * @throws InterruptedException declared for callers; not thrown by the current implementation
     */
    public ArrayList<String> DownloadWQwebpage_HtmlUnit(String stationID, String wqTestCode) throws IOException, InterruptedException {
        //Specify flow website from inputs
        String WQWebsite = "http://waterdata.usgs.gov/nwis/nwisman/?site_no=" + stationID + "&agency_cd=USGS";

        WebClient webClient = new WebClient();
        try{
            webClient.setThrowExceptionOnScriptError(false);

            //Get webpage
            HtmlPage mainPage = null;
            try {
                mainPage = webClient.getPage(WQWebsite);
            }catch (FailingHttpStatusCodeException e) {
                e.printStackTrace();
            }catch (IOException e) {
                //Also covers MalformedURLException
                e.printStackTrace();
            }
            if(mainPage == null){
                return messageList("Error: USGS_readWQData_0001\n Error retriving webpage: " + WQWebsite + "");
            }

            //Follow the link to the station's water quality search form
            List<?> linkList1 = (List<?>) mainPage.getByXPath("//a[@href='/nwis/qwdata/?site_no=" + stationID + "']");
            if(linkList1.size() != 1){
                return messageList("Error: USGS_readWQData_0002\n There are no " + wqTestCode + " water quality tests for station: " + stationID);
            }
            HtmlPage wqPage;
            try{
                HtmlAnchor selectAll = (HtmlAnchor) linkList1.get(0);
                selectAll.focus();
                wqPage = selectAll.click();
            }catch(IOException e){
                ArrayList<String> errorMessage = messageList("Error: USGS_readWQData_0003\n");
                errorMessage.add(e.toString());
                return errorMessage;
            }

            //Enter parameter code for data search
            HtmlTextArea wqCodeSearch = (HtmlTextArea) requireElement(wqPage, "radio_multiple_parm_cds");
            //Set focus on this element to allow the webpage's javascript to check the radio button corresponding to this element
            wqCodeSearch.focus();
            wqCodeSearch.setText(wqTestCode);

            //Change the data format to a better format
            HtmlSelect dataFormat = (HtmlSelect) requireElement(wqPage, "qw_sample_wide");
            //Set focus on this element to allow the webpage's javascript to check the radio button corresponding to this element
            dataFormat.focus();
            dataFormat.setSelectedAttribute("One result per row", true);

            //Change the download to display in browser
            HtmlSelect downloadType = (HtmlSelect) requireElement(wqPage, "rdb_compr_id");
            downloadType.setSelectedAttribute("Display in browser", true);

            //Get the result page
            List<?> submitList = (List<?>) wqPage.getByXPath("//input[@value='Submit']");
            if(submitList.isEmpty()){
                throw new IOException("USGS water quality search page has no Submit button for station: " + stationID);
            }
            HtmlSubmitInput submitButton = (HtmlSubmitInput) submitList.get(0);
            TextPage resultPage = submitButton.click();

            //Extract data from result page
            String resultPageContents = resultPage.getContent();
            if(resultPageContents.contains("No valid parameter codes")){
                return messageList("Error: USGS_readWQData_0004\n There are no " + wqTestCode + " water quality tests for station: " + stationID);
            }
            ArrayList<String> pageData = new ArrayList<String>();
            for(String row : resultPageContents.split("\n")){
                pageData.add(row);
            }
            return pageData;
        }finally{
            //Release the windows held by the client on every exit path
            webClient.closeAllWindows();
        }
    }
    /**
     * Reduces all water quality data to just that of the requested parameter within the date range.
     * Dates are compared as strings, so they are expected in yyyy-mm-dd form; both bounds are inclusive.
     * The first row of {@code allData} is always skipped (the original author notes it contains the station name).
     * NOTE(review): confirm that callers really pass such a leading row, otherwise the first data row is lost.
     * @param allData  all water quality data for the earlier provided date range and station ID (column1 = date, column2 = wqTestcode, column3 = value)
     * @param wqTestCode  the requested water quality parameter (compared ignoring case)
     * @param beginDate  the user defined begin date for data search
     * @param endDate  the user defined end date for data search
     * @return a String[][] of the matching rows with column1 = date, column2 = water quality test value
     * @throws IOException declared for callers; not thrown by the current implementation
     */
    public String[][] minimizeUSGSWQdata(String[][] allData, String wqTestCode, String beginDate, String endDate) throws IOException{
        List<String[]> kept = new ArrayList<String[]>();
        //Start at 1 to ignore the first row
        for(int i=1; i<allData.length; i++){
            String[] row = allData[i];
            if(row[1].equalsIgnoreCase(wqTestCode) &&
              (row[0].compareTo(beginDate) >= 0) && (row[0].compareTo(endDate) <= 0)){
                //row[0] = date, row[2] = WQ test result value
                kept.add(new String[]{row[0], row[2]});
            }
        }
        return kept.toArray(new String[kept.size()][]);
    }
    /**
     * Merges the two arrays into a single array of returnArray.length = (array1.length + array2.length).  
     * Only the first two columns of each row are copied, which matches the output of "minimizeUSGSWQdata"
     * (dates in the first column and values in the second). Rows of array1 come first, then rows of array2.
     * @param array1  first String[][] array to be combined column1 = dates, column2 = values
     * @param array2  second String[][] array to be combined column1 = dates, column2 = values
     * @return a combined two column array of array1 followed by array2
     */
    public String[][] mergeMinimizedWQdata(String[][] array1, String[][] array2){
        String[][] newArray = new String[array1.length + array2.length][2];

        int next = 0;
        for(int i=0; i<array1.length; i++, next++){
            newArray[next][0] = array1[i][0];
            newArray[next][1] = array1[i][1];
        }
        for(int i=0; i<array2.length; i++, next++){
            newArray[next][0] = array2[i][0];
            newArray[next][1] = array2[i][1];
        }

        return newArray;
    }
    /**
     * Get the water quality webpage and loop through and pull out the water quality data for the current station.
     * Only rows that start with "USGS", have at least 15 tab separated columns and have a non-blank
     * test code (13th column) and result (15th column) are kept.
     * @param stationID  the USGS station ID for the current station
     * @return  a String[][] containing column1 = date(yyyy-mm-dd), column2 = water quality test code, column3 = test value
     * @throws IOException if the page cannot be opened or read
     * @throws InterruptedException if the thread is interrupted while waiting between download attempts
     */
    public String[][] USGS_read_LDC(String stationID) throws IOException, InterruptedException{
        //Get the webpage of data for the USGS flow station
        ArrayList<String> webpageAll = DownloadWQwebpage(stationID);

        List<String[]> rows = new ArrayList<String[]>();
        for(String line : webpageAll){
            String[] f = line.split("\t");
            //f[2] = date
            //f[12] = water quality test code
            //f[14] = water quality test value
            if ((f.length >= 15) && f[0].equals("USGS") && !f[12].equals("") && !f[14].equals("")) {
                rows.add(new String[]{f[2], f[12], f[14]});
            }
        }

        return rows.toArray(new String[rows.size()][]);
    }

    /** Reads every line of the page at {@code address}, closing the connection's reader even when reading fails. */
    private static ArrayList<String> readAllLines(String address) throws IOException {
        URL webpage = new URL(address);
        URLConnection connection = webpage.openConnection();
        BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
        try{
            ArrayList<String> pageData = new ArrayList<String>();
            String inputLine;
            while((inputLine = in.readLine()) != null){
                pageData.add(inputLine);
            }
            return pageData;
        }finally{
            in.close();
        }
    }

    /** Returns true when {@code value} is one of the codes that stand in for a missing measurement. */
    private static boolean isNonValueCode(String value){
        for(String code : NON_VALUE_CODES){
            if(value.equalsIgnoreCase(code)){
                return true;
            }
        }
        return false;
    }

    /** Downloads the water quality listing for the station, retrying while the response is an error page. */
    private static ArrayList<String> downloadWQLines(String stationID) throws IOException, InterruptedException {
        String WQWebsite = "http://nwis.waterdata.usgs.gov/usa/nwis/qwdata/?site_no=" + stationID + "&agency_cd=USGS&inventory_output=0&rdb_inventory_output=value&TZoutput=0&pm_cd_compare=Greater%20than&radio_parm_cds=all_parm_cds&qw_attributes=0&format=rdb&qw_sample_wide=0&rdb_qw_attributes=0&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list";

        for(int attempt=0; attempt<=MAX_WQ_RETRIES; attempt++){
            if(attempt > 0){
                //Give the server a moment before asking again
                Thread.sleep(WQ_RETRY_DELAY_MS);
            }
            ArrayList<String> pageData = readAllLines(WQWebsite);
            if(!containsKeyword(pageData, WQ_ERROR_KEYWORD)){
                //The page that was checked is the page that is returned
                return pageData;
            }
        }
        //The page likely didn't load or the station search didn't work, report this with an empty result
        return new ArrayList<String>();
    }

    /** Returns true when any of the lines contains {@code keyword}. */
    private static boolean containsKeyword(List<String> lines, String keyword){
        for(String line : lines){
            if(line.contains(keyword)){
                return true;
            }
        }
        return false;
    }

    /** Builds a new list holding the single given message. */
    private static ArrayList<String> messageList(String message){
        ArrayList<String> list = new ArrayList<String>();
        list.add(message);
        return list;
    }

    /** Looks up a form element by id and fails with an IOException instead of a NullPointerException when it is absent. */
    private static Object requireElement(HtmlPage page, String id) throws IOException {
        Object element = page.getElementById(id);
        if(element == null){
            throw new IOException("USGS water quality search page is missing the expected element: " + id);
        }
        return element;
    }
}