package de.hawlitzek.logparser;

import java.util.*;
import java.util.zip.*;
import java.io.*;

import de.hawlitzek.util.text.*;

/**
 * This log analyzer reads a given HTTP log and counts how many requests
 * come from a browser or search robot. It also allows setting filters
 * (e.g. for counting only HTML page requests or images) or counting
 * search requests from search engines. It also displays which query items are used
 * to find your pages.
 * Finally, it prints a small statistic.
 * 
 * @author Florian Hawlitzek, Hawlitzek IT-Consulting GmbH<br>
 * Thanks to all contributors, especially to Aaron Holley!
 */
public class FHDetailHttpLogParser {

	public static final String VERSION = "1.2.1";

	/** statistic of the used browsers (browser id -> BrowserEntry) */
	private SortedMap browserStats = new TreeMap();
	/** statistic of the requested pages (page -> BrowserEntry) */
	private SortedMap pageStats = new TreeMap();
	/** list of known search engines for the statistics (referrer fragment -> display name) */
	Properties searchengines = new Properties();
	/** statistic of used search engines (referrer fragment -> CountedEntry) */
	private SortedMap searchEngineStats = new TreeMap();
	/** statistic of search keywords (keyword -> CountedEntry) */
	private SortedMap googleStats = new TreeMap();
	/** number of all log entries */
	private int nrOcc = 0;
	/** number of log entries which matched the filter */
	private int nrRealPageOcc = 0;
	/** unique client IP addresses (IP -> CountedEntry) */
	private SortedMap hostIPs = new TreeMap();
	/** contained dates (sortable day key -> CountedEntry), iterated in chronological order */
	private SortedMap dateMap = new TreeMap();

	/**
	 * format used to display a day in the date statistics.
	 * Shared by all instances; note that SimpleDateFormat is not thread-safe.
	 */
	private static final java.text.SimpleDateFormat dateFormat = new java.text.SimpleDateFormat("dd/MMM/yyyy", java.util.Locale.US);
	/**
	 * format used to build the key of {@link #dateMap}. Its lexicographic order
	 * equals the chronological order, which the display format does not provide.
	 */
	private static final java.text.SimpleDateFormat dateKeyFormat = new java.text.SimpleDateFormat("yyyyMMdd", java.util.Locale.US);


	/**
	 * Creates a parser and loads the list of known search engines from the
	 * resource <code>searchengines.properties</code> located next to this class.
	 * A missing or unreadable resource is reported on <code>System.err</code>;
	 * the parser then simply knows no search engines.
	 */
	public FHDetailHttpLogParser() {
		InputStream is = this.getClass().getResourceAsStream("searchengines.properties");
		if (is == null) {
			System.err.println("Could not find search engine configuration file (searchengines.properties).");
			return;
		}
		try {
			searchengines.load(is);
		}
		catch (IOException e) {
			e.printStackTrace();
		}
		finally {
			// close the stream even if loading failed
			try {
				is.close();
			}
			catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Increments the CountedEntry stored under <code>id</code>, or registers
	 * a new CountedEntry for it when the id has not been seen yet.
	 */
	private static void countOccurrence(Map stats, String id) {
		CountedEntry entry = (CountedEntry)stats.get(id);
		if (entry != null)
			entry.addAnzVorkommen();
		else
			stats.put(id, new CountedEntry(id));
	}

	/**
	 * Increments the BrowserEntry stored under <code>id</code>, or registers
	 * a new BrowserEntry for it when the id has not been seen yet.
	 */
	private static void countBrowserEntry(Map stats, String id) {
		BrowserEntry entry = (BrowserEntry)stats.get(id);
		if (entry != null)
			entry.addAnzVorkommen();
		else
			stats.put(id, new BrowserEntry(id));
	}

	/**
	 * adjust the statistic of browser occurrences for a log entry
	 * @param browser browser identification of the log entry
	 */
	protected void adjustBrowserStats(String browser) {
		countBrowserEntry(browserStats, browser);
	}

	/**
	 * adjust the statistic of a referenced page or resource
	 * @param logEntry the parsed log entry
	 */
	protected void adjustPageStats(LogEntry logEntry) {
		countBrowserEntry(pageStats, logEntry.getPage());
	}

	/**
	 * adjust the statistic for a specific IP address
	 * @param logEntry the parsed log entry
	 */
	protected void adjustForSingleIP(LogEntry logEntry) {
		countOccurrence(hostIPs, logEntry.getClientIP());
	}

	/**
	 * adjust the statistic for a specific date (day granularity)
	 * @param logEntry the parsed log entry
	 */
	protected void adjustDate(LogEntry logEntry) {
		String date = dateFormat.format(logEntry.getDate());
		// Key by yyyyMMdd so that the sorted map iterates chronologically;
		// the entry itself keeps the readable dd/MMM/yyyy text as its id.
		String sortKey = dateKeyFormat.format(logEntry.getDate());
		CountedEntry entry = (CountedEntry)dateMap.get(sortKey);
		if (entry != null)
			entry.addAnzVorkommen();
		else
			dateMap.put(sortKey, new CountedEntry(date));
	}

	/**
	 * adjust the search engine and keyword statistics for the referrer
	 * of a log entry. Entries without a referrer are ignored.
	 * @param logEntry the parsed log entry
	 */
	protected void adjustSearchEngineStats(LogEntry logEntry) {

		String referrer = logEntry.getReferrer();
		if (referrer==null || referrer.equals("-"))
			return;

		// count the first configured search engine whose key occurs in the referrer
		Iterator keys = searchengines.keySet().iterator();
		while (keys.hasNext()) {
			String key = (String)keys.next();
			if (referrer.indexOf(key)>=0) {
				countOccurrence(searchEngineStats, key);
				break;
			}
		}

		// extract the keywords for the engines whose query syntax is supported
		String[] keywords = null;
		if (referrer.indexOf("google")>=0) {
			keywords = logEntry.parseGoogleKeywords();
		}
		else if (referrer.indexOf("fireball")>=0) {
			keywords = logEntry.parseFireballKeywords();
		}
		else if (referrer.indexOf("altavista")>=0) {
			keywords = logEntry.parseAltavistaKeywords();
		}

		if (keywords!=null) {
			for (int i=0; i<keywords.length; i++)
				countOccurrence(googleStats, keywords[i]);
		}
	}

	/**
	 * print all statistics to <code>System.out</code>
	 * @param filterType the filter type used for reading the log; may be <code>null</code>
	 */
	public void displayStats(String filterType)
	{
		StringBuffer result = new StringBuffer();

		result.append("All access count: " + getTotalOccurences() + "\n\n");

		result.append("browser statistics (in alphabetic order):\n\n");
		result.append(getBrowserStats(true));

		result.append("\nbrowser statistics (per frequency):\n\n");
		result.append(getBrowserStats(false));
		result.append("\n");

		if (isSearchEngineStats(filterType))
			result.append(getSearchEngineStats());
		else
			result.append(getPageStats());
		result.append("\n");
		result.append(getIPStats());
		result.append("\n");
		result.append(getDateStats());
		System.out.println(result.toString());
	}

	/**
	 * return the browser statistics, one entry per line
	 * @param alphabetic <code>true</code>: ordered by browser id,
	 *                   <code>false</code>: ordered by CountedEntryComparator
	 */
	public String getBrowserStats(boolean alphabetic)
	{
		StringBuffer result = new StringBuffer();

		// values() of the sorted map are already ordered by browser id
		BrowserEntry[] entries = (BrowserEntry[])browserStats.values().toArray(new BrowserEntry[0]);

		// Refresh the relative numbers in both modes, so that the frequency view
		// does not depend on the alphabetic view having been requested before.
		for (int i=0; i<entries.length; i++)
			entries[i].calcNrOccurencesRelative(nrRealPageOcc);

		if (!alphabetic)
			Arrays.sort(entries, new CountedEntryComparator());

		for (int i=0; i<entries.length; i++)
		{
			result.append(entries[i]);
			result.append("\n");
		}
		return result.toString();
	}

	/**
	 * return the number of log entries which matched the filter
	 */
	public String getFilteredOccurences()
	{
		return String.valueOf(nrRealPageOcc);
	}

	/**
	 * return the page statistics (ordered by CountedEntryComparator), one entry per line
	 */
	public String getPageStats()
	{
		StringBuffer result = new StringBuffer();

		result.append("Filtered page counts: " + getFilteredOccurences() + "\n");

		if (nrRealPageOcc>0) {
			result.append("\npage statistics (per frequency):\n\n");

			BrowserEntry[] pages = (BrowserEntry[])pageStats.values().toArray(new BrowserEntry[0]);
			Arrays.sort(pages, new CountedEntryComparator());
			for (int i=0; i<pages.length; i++)
			{
				result.append(pages[i]);
				result.append("\n");
			}
		}
		return result.toString();
	}

	/**
	 * return the IP statistics (ordered by CountedEntryComparator), one entry per line
	 */
	public String getIPStats()
	{
		StringBuffer result = new StringBuffer();

		result.append("IP statistics (per frequency):\n\n");

		CountedEntry[] ips = (CountedEntry[])hostIPs.values().toArray(new CountedEntry[0]);
		Arrays.sort(ips, new CountedEntryComparator());
		result.append("Total number of unique IPs: " + ips.length + "\n\n");
		for (int i=0; i<ips.length; i++)
		{
			result.append(ips[i]);
			result.append("\n");
		}
		return result.toString();
	}

	/**
	 * return the date statistics in chronological order, one day per line
	 */
	public String getDateStats()
	{
		StringBuffer result = new StringBuffer();

		result.append("Date statistics:\n\n");

		// deliberately not re-sorted: the map order is the chronological order
		Iterator days = dateMap.values().iterator();
		while (days.hasNext())
		{
			result.append((CountedEntry)days.next());
			result.append("\n");
		}
		return result.toString();
	}

	/**
	 * return the search engine statistics followed by the keyword statistics
	 */
	public String getSearchEngineStats()
	{
		StringBuffer result = new StringBuffer();

		if (!searchEngineStats.isEmpty()) {
			CountedEntry[] engines = (CountedEntry[])searchEngineStats.values().toArray(new CountedEntry[0]);
			Arrays.sort(engines, new CountedEntryComparator());
			for (int i=0; i<engines.length; i++)
			{
				CountedEntry engine = engines[i];
				// the entry id is the properties key; print the configured display name
				result.append(searchengines.get(engine.getId()));
				result.append(" search hits: ");
				result.append(engine.getNrOccurencesAbsolute());
				result.append("\n");
			}
		}

		result.append("\n");

		if (!googleStats.isEmpty()) {
			result.append("\nkeywords used in Google/Fireball/Altavista searches (per frequency):\n\n");

			CountedEntry[] words = (CountedEntry[])googleStats.values().toArray(new CountedEntry[0]);
			Arrays.sort(words, new CountedEntryComparator());
			for (int i=0; i<words.length; i++)
			{
				result.append(words[i]);
				result.append("\n");
			}
		}
		return result.toString();
	}

	/**
	 * return the total number of log entries
	 */
	public String getTotalOccurences()
	{
		return String.valueOf(nrOcc);
	}

	/**
	 * is this search using referrer or resource for filtering?
	 * @return <code>true</code> if the filter type is the referrer filter;
	 *         <code>false</code> otherwise, including for <code>null</code> (no filter)
	 * @param filterType java.lang.String, may be <code>null</code>
	 */
	public static boolean isSearchEngineStats(String filterType) {
		return filterType != null && filterType.equals(LogEntryFilterTypes.REF);
	}

	/**
	 * main method
	 * @param args args[0]: the http log file, args[1]: optional filter type,
	 *             args[2..n]: optional filter strings
	 */
	public static void main(String[] args) {
		try {
			// read arguments (showUsage() terminates the VM)
			int nrArgs = args.length;
			if (nrArgs == 0 || args[0] == null) showUsage();
			// is http log file available?
			File file = new File(args[0]);
			if (!file.exists() || !file.canRead()) showUsage();

			// look for filter arguments
			String filterType = null;
			String[] filterArgs = null;
			if (nrArgs>1) {
				// fixed locale: the default locale may map letters differently (e.g. Turkish 'i')
				filterType = args[1].toUpperCase(Locale.US);
				filterArgs = new String[nrArgs-2];
				System.arraycopy(args, 2, filterArgs, 0, filterArgs.length);
			}

			// start parsing...
			FHDetailHttpLogParser runner = new FHDetailHttpLogParser();
			runner.readFile(file, filterType, filterArgs);
			runner.displayStats(filterType);
			//runner.parseLine("195.219.78.43 - - [01/Apr/2002:00:31:13 +0200] \"GET /Java_Downloads/java_downloads.html HTTP/1.0\" 304 - www.hawlitzek-consulting.de \"http://www.google.com/search?q=%22java+downloads%22&hl=en&start=10&sa=N\" \"Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt)\" \"202.124.207.180\"");
			//runner.parseLine("134.102.218.51 - - [22/Mar/2002:11:32:23 +0100] \"GET /Veroffentlichungen/Nitty_Gritty_Java/nitty_gritty_java.html HTTP/1.0\" 200 1407 www.hawlitzek-consulting.de \"http://www.google.de/search?q=nitty+gritty+java&ie=ISO-8859-1&oe=ISO-8859-1&hl=de&meta=\" \"Mozilla/4.79 [en] (X11; U; SunOS 5.8 sun4u)\" \"134.102.201.61\"");
		}
		catch (Exception e)
		{
			String message = e.getMessage();
			if (message != null)
				System.err.println(message);
			else
				e.printStackTrace();
		}
	}

	/**
	 * read and parse a log file and add its entries to the statistics.
	 * Files whose name ends with ".gz" are read through a GZIP stream.
	 * An I/O error is reported on <code>System.err</code> and ends the reading.
	 * @param file http log file
	 * @param filterType type of the search: HTML:   only html pages,
	 *                                       MAIN:   only main html pages
	 *                                       IMG:    only images
	 *                                       REF:    look in referrer, not in requested resource
	 *                                       <null>: in all resources
	 * @param filter set of AND combined search items
	 * @throws InvalidLogEntryException if a line cannot be processed; any other
	 *         non-I/O failure is wrapped into this exception type as well
	 */
	public void readFile(File file, String filterType, String[] filter) throws InvalidLogEntryException
	{
		InputStream in = null;
		BufferedReader br = null;
		try
		{
			in = new FileInputStream(file);
			if (file.getName().endsWith(".gz")) {
				// if this constructor throws, 'in' still refers to the file stream
				// and is closed in the finally block
				in = new GZIPInputStream(in);
			}
			br = new BufferedReader(new InputStreamReader(in));

			int counter = 0;
			// null-safe: a null filter type means "no filter" and selects the page statistics
			boolean searchEngineMode = isSearchEngineStats(filterType);

			String line;
			while ((line = br.readLine()) != null)
			{
				counter++;
				LogEntry logEntry = new LogEntry();
				// parse the line; the browser identification is returned
				String browser = LogEntry.parseLine(line, logEntry);

				// analyze filter
				if (logEntry.matches(filterType, filter))
				{
					nrRealPageOcc++;
					// browser statistics
					adjustBrowserStats(browser);

					if (searchEngineMode) // search engine statistics
						adjustSearchEngineStats(logEntry);
					else // page statistics
						adjustPageStats(logEntry);

					adjustForSingleIP(logEntry);
					adjustDate(logEntry);
				}
			}
			// accumulate like every other statistic, so that several files can be read
			nrOcc += counter;
		}
		catch (IOException e)
		{
			e.printStackTrace();
		}
		catch (Exception e)
		{
			if (e instanceof InvalidLogEntryException)
				throw (InvalidLogEntryException)e;
			else
				throw new InvalidLogEntryException(e.toString());
		}
		finally {
			try {
				// closing the reader closes the wrapped streams; fall back to the
				// raw stream when the reader could not be created
				if (br != null)
					br.close();
				else if (in != null)
					in.close();
			}
			catch (IOException e) {
				System.err.println("Couldn't close file: " + e.toString());
			}
		}
	}

	/**
	 * print the command line usage and terminate the VM with exit code 1
	 */
	private static void showUsage()
	{
		System.out.println("Usage: java de.hawlitzek.logparser.FHDetailHttpLogParser <file> [<filter>]");
		System.out.println("<filter>: [ALL | MAIN | HTML | IMG | REF] (filterString)*");
		System.out.println("version " + VERSION);
		System.exit(1);
	}
}
