<?php
/*
 * squeezedb.php
 *
 * Developed by Clayton Dukes <cdukes@cdukes.com>
 * Copyright (c) 2008 http://nms.gdd.net
 * Licensed under terms of GNU General Public License.
 * All rights reserved.
 *
 * Changelog:
 * 2008-02-08 - created
 * 2008-02-10 - Fixed count bug (db inserts were calculating incorrect count)
 *
 */

/*
   NOTE: Versions of php-syslog-ng 2.9.5d and below will need to alter the database to use this script
   ALTER TABLE logs ADD counter INT NOT NULL DEFAULT 1;
   ALTER TABLE logs ADD fo datetime default NULL;
   ALTER TABLE logs ADD lo datetime default NULL;

   Basic Usage:
   $s = "/USR/SBIN/CRON[10749]: (root) CMD (php /www/php-syslog-ng/scripts/reloadcache.php >> /var/log/php-syslog-ng/reloadcache.log)";
   $s2 = "/USR/SBIN/CRON[10849]: (root) CMD (php /www/php-syslog-ng/scripts/reloadcache.php >> /var/log/php-syslog-ng/reloadcache.log)";
   similar_text($s, $s2, $p);
   echo "Percent: $p%";

   Description:
   This calculates the similarity between two strings as described in Oliver [1993]. 
   Note that this implementation does not use a stack as in Oliver's pseudo code, but 
   recursive calls which may or may not speed up the whole process. Note also that the 
   complexity of this algorithm is O(N**3) where N is the length of the longest string.
   
   Once a match is made, the source row is updated with a "count" of the destination (compared to) row.
   It also logs the "first occurance" and "last occurance" of the message so that you can get an idea
   of how long the message has been repeating.

   Why is this useful?
   Running this script makes a HUGE difference on the amount of data you have to store, and subsequently, search through
   to get to what you really want - an answer to the # 1 question for most customers: Where are my problem children?

   Example:
   When tested on a smaller database, I get the following results:
   Starting Row Count = 12832
   Ending Row Count = 1770
   Cleaned 11062 records saving 86 percent
   Squeeze finished in 1318.39662099 seconds
   (note: I hope to get this script to run faster/more efficiently in the future, if you have ideas, please let me know!)

   Now, we can quickly pull valuable data from the (much smaller!) databse by doing:
   (this will show the top 10 "problem" children")
   mysql> SELECT host,counter,msg from logs WHERE counter>1 ORDER BY counter DESC limit 10;

   +--------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
   | host    | counter | msg
   +--------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
   | host-a |     668 | 466927:         If Cisco determines that your insertion of non-Cisco memory, WIC cards,
   |
   | host-b |     644 | 466928:         AIM cards, Network Modules, SPA cards, GBICs or other modules into a
   |
   | host-c |     614 | 466925:         The module in slot 4 in this router may not be a genuine Cisco product.
   |
   | host-d |     585 | 466926:         Cisco warranties and support programs only apply to genuine Cisco products.
   |
   | host-e |     572 | 466929:         Cisco product is the cause of a support issue, Cisco may deny support under
   |
   | host-f |     385 | 466930:         your warranty or under a Cisco support program such as SmartNet.
   |
   | host-g |     215 | 467114:         your warranty or under a Cisco support program such as SmartNet.
   |
   | host-h |     165 | 32574: %CDP-4-DUPLEX_MISMATCH: duplex mismatch discovered on FastEthernet1/26 (not full duplex), with 7200-3 FastEthernet1/0 (full duplex).  
   |
   | host-i |     133 | /USR/SBIN/CRON[2858]: (root) CMD (php /www/php-syslog-ng/scripts/reloadcache.php >> /var/log/php-syslog-ng/reloadcache.log)
   |
   | host-j |     133 | /USR/SBIN/CRON[2861]: (root) CMD (php /www/rtpnml-xray/htdocs/cacti/poller.php > /dev/null 2>&1)
   |
   +--------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------+
   10 rows in set (0.01 sec)

   As you can see, most of the "spammy" messages are useless and could easily be ignored or deleted (and definitely shouldn't be taking up so many rows!)
   Also note that one of the top issues listed is CDP Duplex (gee there's a surprise) - maybe we should go fix that host/device!

   This is an excellent way (and much faster) to get your Top X problem children with an added benefit of trimming 85-95% of the fat off your database!
 */

$basePath = dirname( __FILE__ );
include_once "$basePath/../html/includes/common_funcs.php";
include_once "$basePath/../config/config.php";

$time_start = get_microtime();
echo "\nStarting Squeeze\n";

// Open connection to DB
$dbLink = db_connect_syslog(DBADMIN, DBADMINPW);

//========================================================================
// START: Set level to match on
// Possible values are 1 to 99.9999999999
// I highly recommend NOT using 1 unless you want to lose all your data...
//========================================================================
$matchpercent = 95;

//------------------------------------------------------------------------
// Grab all rows by sequence # (the primary key) for processing
//------------------------------------------------------------------------
if(defined('COUNT_ROWS') && COUNT_ROWS == TRUE) {
   	$query = "SELECT SQL_CALC_FOUND_ROWS seq from " .DEFAULTLOGTABLE ." ORDER BY seq ASC";
}
else {
   	$query = "SELECT seq from " .DEFAULTLOGTABLE ." ORDER BY seq ASC";
}


$result = perform_query($query, $dbLink);


//------------------------------------------------------------------------
// Get row count
//------------------------------------------------------------------------
if(defined('COUNT_ROWS') && COUNT_ROWS == TRUE) {
	$num_results_array = perform_query("SELECT FOUND_ROWS()", $dbLink);
	$num_results_array = fetch_array($num_results_array);
	$num_rows = $num_results_array[0];
} else {
   	$num_rows = mysql_num_rows($result);
}
echo "$num_rows total rows to process\n";


//------------------------------------------------------------------------
// Begin outer loop
// Loop through each primary key (SEQ) in the table
//------------------------------------------------------------------------
while ($row = fetch_array($result)) {

	//------------------------------------------------------------------------
   	// Query the table again to get the rows for this (SEQ) 
	//------------------------------------------------------------------------
   	$query2 = "SELECT * from " .DEFAULTLOGTABLE ." WHERE seq='" .$row['seq'] ."'";
   	$results2 = perform_query($query2, $dbLink);
   	$row2 = fetch_array($results2);
   	$source_count = 1;
   	$dest_count = 1;

	//------------------------------------------------------------------------
   	// If the row doesn't exist, it's because we deleted it 
	// in this loop already, so we'll just skip to the next row
   	//------------------------------------------------------------------------
   	if ($row2) {
	   	//------------------------------------------------------------------------
	   	// Get 1st comparison (called source in this script)
	   	//------------------------------------------------------------------------
	   	$source_seq = $row2['seq'];
	   	$source_count = $row2['counter'];
	   	//------------------------------------------------------------------------
	   	// The checks below are here in case the FirstOccurance and LastOccurance 
		// columns are NULL (I set them to the datetime entry instead, which would
	   	// be when the log was entered into the database.)
	   	//------------------------------------------------------------------------
	   	if ($row2['fo']) {
		   	$source_fo = $row2['fo'];
	   	} else {
		   	$source_fo = $row2['datetime'];
	   	}
	   	if ($row2['lo']) {
		   	$source_lo = $row2['lo'];
	   	} else {
		   	$source_lo = $row2['datetime'];
	   	}
	   	//------------------------------------------------------------------------
	   	// Below sets the actual string to compare
	   	// I'm doing a comparison on hostname and message, but you can add others.
	  	//------------------------------------------------------------------------
	   	$source = $row2['host'].",".$row2['msg'];
	   	echo "\nSource:[" . $row2['seq'] ."] " . $source ."\n";


		//------------------------------------------------------------------------
	   	// Now we're going to start a 3rd loop inside loop 2 to compare the rows from loops 2 and 3
	   	// Remember that we're in a loop(1) of the current SEQ, so exclude that from the search
	   	//------------------------------------------------------------------------
	   	$query3 = "SELECT * from " .DEFAULTLOGTABLE ." WHERE seq NOT in ('" .$row2['seq'] ."')ORDER BY seq ASC";
	   	$results3 = perform_query($query3, $dbLink);
	   	while($row3 = fetch_array($results3)) {
		   	//------------------------------------------------------------------------
		   	// Get 2nd comparison (called next in this script)
		   	//------------------------------------------------------------------------
		   	$next = $row3['host'].",".$row3['msg'];
		   	$lo = $row3['datetime'];
		   	$dest_count = $row3['counter'];

			//------------------------------------------------------------------------
		   	// Get match and run queries if it's greater than $matchpercent
		   	//------------------------------------------------------------------------
		   	similar_text($source, $next, $p);
		   	if ($p >= $matchpercent) {
			   	$nquery = "UPDATE " .DEFAULTLOGTABLE ." SET counter=" .($source_count + $dest_count) .",fo='$source_fo',lo='$lo' WHERE seq='$source_seq'";
				echo "Source Counter [$source_seq]: $source_count\n";
				echo "Dest Counter [" .$row3['seq'] ."]: $dest_count\n";
				echo "Total: " .($source_count + $dest_count) ."\n";
			   	echo "$nquery\n";
			   	perform_query($nquery, $dbLink);
			   	echo "Match: $p% to [" .$row3['seq'] ."]\n";
			   	echo "Updating [$source_seq] and Deleting [" .$row3['seq'] ."]\n";
			   	$nquery = "DELETE FROM " .DEFAULTLOGTABLE ." WHERE seq='".$row3['seq']."'";
			   	// echo "$nquery\n";
			   	perform_query($nquery, $dbLink);
		   	}
		   	$query4 = "SELECT counter from " .DEFAULTLOGTABLE ." WHERE seq='$source_seq'";
		   	$results4 = perform_query($query4, $dbLink);
		   	$row4 = fetch_array($results4);
		   	$source_count = $row4['counter'];
	   	}
		   	echo "-------------------------------\n";
   	} // else {
   	// echo "Skipping duplicate record...";
   	// }
}

//------------------------------------------------------------------------
// Get a new column count for comparison to original
//------------------------------------------------------------------------
if(defined('COUNT_ROWS') && COUNT_ROWS == TRUE) {
   	$query = "SELECT SQL_CALC_FOUND_ROWS seq from " .DEFAULTLOGTABLE ." ORDER BY seq ASC";
}
else {
   	$query = "SELECT seq from " .DEFAULTLOGTABLE ." ORDER BY seq ASC";
}
$result = perform_query($query, $dbLink);
if(defined('COUNT_ROWS') && COUNT_ROWS == TRUE) {
   	$num_results_array = perform_query("SELECT FOUND_ROWS()", $dbLink);
   	$num_results_array = fetch_array($num_results_array);
   	$num_rows_after = $num_results_array[0];
} else {
   	$num_rows_after = mysql_num_rows($result);
}
//------------------------------------------------------------------------
// Gather and spit out some stats
//------------------------------------------------------------------------
$savings = $num_rows - $num_rows_after;
$savings_p = round( ($savings/$num_rows)*100, 0 );
echo "\nStarting Row Count = $num_rows\n";
echo "Ending Row Count = $num_rows_after\n";
echo "Cleaned $savings records saving $savings_p percent\n";
$time_end = get_microtime();
$exetime = $time_end - $time_start;
echo "Squeeze finished in ".$exetime." seconds\n";
//========================================================================
// END
//========================================================================
?>
