#!/usr/bin/perl -w ############################################################################### #Sampling Tests #Copyright © 2004, Leon Bukhman #Last Updated 6/1/04 # # This script tests the performance of Min Support versions of EASE, Biased EA, # and Biased L2. The data source is a static table located in a mySQL database. # ############################################################################### use strict; use DBI; use POSIX; #configuration my $generateEASEMinSupportSample = 0; #run EASE (0 or 1) my $generateEAMinSupportSample = 0; #run Biased EA (0 or 1) my $generateL2MinSupportSample = 1; #run Biased L2 (0 or 1) my $analyzeSample = 1; #generate freq of items in Sample my $getScore = 1; #show how algorithm performed compared to SRS my $pruneAtBoundary = 1; #prune penalty info at bucket boundaries my $rebuildResultsTable = 0; #rebuild OLAP table with freq of original dataset my $analyzeRandom = 0; #generate freg of items in SRS my $randomSampleIterations = 3; #num times to generate SRS during analyzeRandom #EASE variables (n=sample size, m=num items, h=num halvings) my ($n, $m, $h) = (155340, 270, 5); # sample all data #Biased EA and L2 variables my $len = 7; #length of each transaction (2^d-1) my $rate = .03; #sample rate #Lossy Counting variables my $s = .05; #min support my $e = .005; #error #dimensions my %dim1 = ("A"=>1, "B"=>2); my %dim2 = ("Type1"=>1, "Type2"=>2, "Type3"=>3, "Type4"=>4); my %dim3 = ("1A"=>1, "2A"=>2, "3A"=>3, "4A"=>4, "5A"=>5, "6A"=>6, "7A"=>7, "8A"=>8, "10A"=>9, "10B"=>10, "1B"=>11, "2B"=>12, "5B"=>13, "6B"=>14, "7B"=>15, "8B"=>16, "9B"=>17); #memory usage my $mem = 0; #connection to to database (anonymized) my $dsn = "DBI:mysql:database=[dbname];host=[hostname];port=[port]"; my $dbh = DBI->connect($dsn, "[username]", "[password]"); #run program generateEASEMinSupportSample() and print "EASE sample generated\n" if $generateEASEMinSupportSample; generateEAMinSupportSample() and print "Biased EA sample generated\n" if $generateEAMinSupportSample; generateL2MinSupportSample() and print "Biased L2 sample generated\n" if $generateL2MinSupportSample; rebuildResultsTable() and print "Results table rebuilt\n" if $rebuildResultsTable; analyzeRandom() and print "Random sample analyzed\n" if $analyzeRandom; analyzeSample() and print "Sample analyzed\n" if $analyzeSample; print "Score was " . getScore() ." ". getScore(1)." ". getScore(2)." ". getScore(3)."\n Mem Usage: ".$mem."\n" if $getScore; #disconnect $dbh->disconnect(); ### Main Code ################################################################# #define the transaction (given a record) sub getTransaction { my $rec = shift; my @items = (0,0,0,0,0,0,0,0); my $b = $dim1{$rec->{'Boro'}}; #2 choices my $t = $dim2{$rec->{'Type'}}; #4 choices my $n = $dim3{$rec->{'Network'}}; #17 choices #coordinates in the cube will be $b*5*18 + $t*18 + $n $items[0]=0; $items[1]=$b*90; $items[2]=$t*18; $items[3]=$n; $items[4]=$b*90+$t*18; $items[5]=$b*90+$n; $items[6]=$t*18+$n; $items[7]=$b*90+$t*18+$n; shift @items; return @items; } sub generateL2MinSupportSample { #clear last run $dbh->do("UPDATE tblStructures SET Sample=0"); #select random sample for EASE my $order = ""; # order by rand()"; my $sth = $dbh->prepare("SELECT * FROM tblStructures $order"); $sth->execute; #prepair variables my $N = 0; # Current transaction my $BucketID; # ID of Current Bucket my @f = (); # Approximate Frequency of item my @r = (); # Sampled Frequency of item my @t = (); # First access time (bucket id) of item #main loop while (my $ref = $sth->fetchrow_hashref) { $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($ref); my ($sum_f, $sum_r)=(0,0); foreach my $i (@items) { $t[$i] = $BucketID if ! $f[$i]; $f[$i]++; $sum_f += $f[$i]; $sum_r += $r[$i] if $r[$i]; } #keep transaction if if ($len/2. + $sum_r <= $rate*$sum_f) { $dbh->do("UPDATE tblStructures SET Sample=1 WHERE ID=$ref->{'ID'}"); foreach my $i (@items) { $r[$i]++; } } #prune at bucket boundaries if ($pruneAtBoundary && $e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<@f; $i++) { $count++ if $f[$i]; if ($f[$i] && $f[$i]+$t[$i]-1<=$BucketID){ delete $f[$i]; delete $r[$i]; delete $t[$i]; } $mem = $count if $count > $mem; } } } #clean up $sth->finish; } sub generateEASEMinSupportSample { #clear last run $dbh->do("UPDATE tblStructures SET Sample=0"); #select random sample for EASE my $order = ""; # order by rand()"; my $sth = $dbh->prepare("SELECT * FROM tblStructures $order limit $n"); $sth->execute; #prepair variables my (@Q1, @Q2) = ((),()); #overall penalties my (@Q1temp, @Q2temp) = ((),());#penalties for current bucket my $N = 0; #current element my $BucketID; #ID of Current Bucket my @keptItems = (); #array of transactions of items kept per bucket #precompute deltas my @delta = (); for (my $i=0; $i<$h; $i++) { $delta[$i] = sqrt( 1-exp( -log(2*$m)/($n/(2**$i)) ) ); } #main loop while (my $ref = $sth->fetchrow_hashref) { $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($ref); #for each halving my $keep = 1; for (my $k=0; $k<$h; $k++) { my $d = $delta[$k]; my $sum = 0; foreach my $i (@items) { if ($Q1[$i][$k]) { #if frequent $sum += ($Q1[$i][$k]-$Q2[$i][$k]); } elsif ($Q1temp[$i][$k]) { #in current bucket $sum += ($Q1temp[$i][$k]-$Q2temp[$i][$k]); } } if ($sum<=0) { #keep it foreach my $i (@items) { if ($Q1[$i][$k]) { #if frequent $Q1[$i][$k] *= (1+$d); $Q2[$i][$k] *= (1-$d); } elsif ($Q1temp[$i][$k]) { #in current bucket only $Q1temp[$i][$k] *= (1+$d); $Q2temp[$i][$k] *= (1-$d); } else { #first time $Q1temp[$i][$k] = (1+$d); $Q2temp[$i][$k] = (1-$d); } } } else { #dont keep it foreach my $i (@items) { if ($Q1[$i][$k]) { #if frequent $Q1[$i][$k] *= (1-$d); $Q2[$i][$k] *= (1+$d); } elsif ($Q1temp[$i][$k]) { #in current bucket only $Q1temp[$i][$k] *= (1-$d); $Q2temp[$i][$k] *= (1+$d); } else { #first time $Q1temp[$i][$k] = (1-$d); $Q2temp[$i][$k] = (1+$d); } } $keep = 0; last; #exit loop on k } } if ($keep) { $dbh->do("UPDATE tblStructures SET Sample=1 WHERE ID=$ref->{'ID'}"); push (@keptItems, [@items]); } #prune at bucket boundaries and update penalties of freq items if ($e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<271; $i++) { for (my $j=0; $j<$h; $j++) { $count++ if $Q1[$i][$j]; $count++ if $Q1temp[$i][$j]; } } $mem = $count if $count > $mem; for my $i (0 .. $#keptItems) { #for each kept transaction my $aref = $keptItems[$i]; for my $j (0 .. $#{$aref}) { #for each item my $item = $aref->[$j]; for (my $k=0; $k<$h; $k++) { #for each halving $Q1[$item][$k] = $Q1temp[$item][$k] if ! $Q1[$item][$k]; $Q2[$item][$k] = $Q2temp[$item][$k] if ! $Q2[$item][$k]; } } } #free up memory @keptItems = (); @Q1temp = () if $pruneAtBoundary; @Q2temp = () if $pruneAtBoundary; } } #clean up $sth->finish; } sub generateEAMinSupportSample { #clear last run $dbh->do("UPDATE tblStructures SET Sample=0"); #select random sample for EASE my $order = ""; # order by rand()"; my $sth = $dbh->prepare("SELECT * FROM tblStructures $order limit $n"); $sth->execute; #prepare variables my (@Q1, @Q2) = ((),()); #overall penalties my (@Q1temp, @Q2temp) = ((),());#penalties for current bucket my $N = 0; #current element my $BucketID; #ID of Current Bucket my @keptItems = (); #array of transactions of items kept per bucket #precompute delta my $delta = sqrt(log(2*$m)/($rate*$n)); #main loop while (my $ref = $sth->fetchrow_hashref) { $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($ref); foreach my $i (@items) { if ($Q1[$i]) { #if frequent $Q1[$i] *= (1-$delta)**$rate; $Q2[$i] *= (1+$delta)**$rate; } elsif ($Q1temp[$i]) { #in current bucket only $Q1temp[$i] *= (1-$delta)**$rate; $Q2temp[$i] *= (1+$delta)**$rate; } else { #first time $Q1temp[$i] = (1-$delta)**$rate; $Q2temp[$i] = (1+$delta)**$rate; } } my $sum = 0; foreach my $i (@items) { if ($Q1[$i]) { #if frequent $sum += ($Q1[$i]-$Q2[$i]); } elsif ($Q1temp[$i]) { #in current bucket $sum += ($Q1temp[$i]-$Q2temp[$i]); } } if ($sum<=0) { #keep it foreach my $i (@items) { if ($Q1[$i]) { #if frequent $Q1[$i] *= (1+$delta); $Q2[$i] *= (1-$delta); } elsif ($Q1temp[$i]) { #in current bucket $Q1temp[$i] *= (1+$delta); $Q2temp[$i] *= (1-$delta); } } $dbh->do("UPDATE tblStructures SET Sample=1 WHERE ID=$ref->{'ID'}"); push (@keptItems, [@items]); } #prune at bucket boundaries and update penalties of freq itemsets if ($e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<271; $i++) { $count++ if $Q1[$i]; $count++ if $Q1temp[$i]; } $mem = $count if $count > $mem; for my $i (0 .. $#keptItems) { #for each kept transaction my $aref = $keptItems[$i]; for my $j (0 .. $#{$aref}) { #for each item my $item = $aref->[$j]; $Q1[$item] = $Q1temp[$item] if ! $Q1[$item]; $Q2[$item] = $Q2temp[$item] if ! $Q2[$item]; } } #free up memory @keptItems = (); @Q1temp = () if $pruneAtBoundary; @Q2temp = () if $pruneAtBoundary; } } #clean up $sth->finish; } sub generateRandomSample { #clear last run $dbh->do("UPDATE tblStructures SET Random=0"); #generate random sample my $sampleSize = $rate*$n; $dbh->do("UPDATE tblStructures SET Random=1 order by rand() limit $sampleSize"); } sub rebuildResultsTable { #clear the table $dbh->do("DELETE FROM tblStructuresOLAP"); #arrays of items my @list1 = keys %dim1; unshift(@list1, "ANY"); my @list2 = keys %dim2; unshift(@list2, "ANY"); my @list3 = keys %dim3; unshift(@list3, "ANY"); #generate itemsets my $sth = $dbh->prepare("INSERT INTO tblStructuresOLAP (Dim1, Dim2, Dim3, Ord) " . "VALUES(?,?,?,?)"); foreach my $a (@list1) { foreach my $b (@list2) { foreach my $c (@list3) { my $ord = 3 - ($a eq 'ANY') - ($b eq 'ANY') - ($c eq 'ANY'); $sth->execute($a,$b,$c,$ord); } } } #calculate the frequencies ($alg,$where) calculateItemsetFreq("",""); } sub analyzeSample { #clear last run $dbh->do("UPDATE tblStructuresOLAP SET freqEASE=0"); #calculate the frequencies ($alg,$where) my $filter = "WHERE Sample=1"; #filter to extact sample calculateItemsetFreq("EASE",$filter); } sub analyzeRandom { #clear last run $dbh->do("UPDATE tblStructuresOLAP SET freqRandom=0"); for (my $i=1; $i<=$randomSampleIterations; $i++) { generateRandomSample(); #calculate the frequencies ($alg,$where) my $filter = "WHERE Random=1"; #filter to extact sample calculateItemsetFreq("Random",$filter); print "--Generated random sample $i\n"; } #average the frequencies $dbh->do("UPDATE tblStructuresOLAP SET freqRandom=freqRandom/$randomSampleIterations"); } #note frequencies get accumulated sub calculateItemsetFreq { my ($alg,$where) = @_; #get size of recordset my $sth = $dbh->prepare("SELECT count(*) FROM tblStructures $where"); $sth->execute; my $size = ($sth->fetchrow_array)[0]; $sth->finish; my $figures = '.00000'; #freq will be acurate to 2 + $figures places #itemset 0: ANY, ANY, ANY $dbh->do("UPDATE tblStructuresOLAP SET freq$alg=freq$alg+1 " . "WHERE Dim1='ANY' and Dim2='ANY' and Dim3='ANY'"); #itemset 1: Boro, ANY, ANY $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Boro as dim, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Boro"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.dim and Dim2='ANY' and Dim3='ANY'"); $dbh->do("DROP TABLE tblA"); #itemset 2: ANY, Type, ANY $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Type as dim, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Type"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1='ANY' and Dim2=tblA.dim and Dim3='ANY'"); $dbh->do("DROP TABLE tblA"); #itemset 3: ANY, ANY, Ntwk $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Network as dim, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Network"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1='ANY' and Dim2='ANY' and Dim3=tblA.dim"); $dbh->do("DROP TABLE tblA"); #itemset 4: Boro, Type, ANY $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Boro d1, Type d2, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Boro, Type"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.d1 and Dim2=tblA.d2 and Dim3='ANY'"); $dbh->do("DROP TABLE tblA"); #itemset 5: Boro, ANY, Ntwk $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Boro d1, Network d2, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Boro, Network"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.d1 and Dim2='ANY' and Dim3=tblA.d2"); $dbh->do("DROP TABLE tblA"); #itemset 6: ANY, Type, Ntwk $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Type d1, Network d2, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Type, Network"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1='ANY' and Dim2=tblA.d1 and Dim3=tblA.d2"); $dbh->do("DROP TABLE tblA"); #itemset 7: Boro, Type, Ntwk $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT Boro d1, Type d2, Network d3, count(*)/$size$figures as freq " . "FROM tblStructures $where GROUP BY Boro, Type, Network"); $dbh->do("UPDATE tblStructuresOLAP, tblA " . "SET tblStructuresOLAP.freq$alg=tblStructuresOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.d1 and Dim2=tblA.d2 and Dim3=tblA.d3"); $dbh->do("DROP TABLE tblA"); } #bigger is better (negative score indicates worse performance than SRS) sub getScore { my $ord = shift; my $filter = "WHERE freq>$s "; if ($ord) { $filter .= "and Ord = $ord"; } #distance 1 my $sql = "SELECT ifnull(round(sum(abs(freq-freqRandom))/sum(abs(freq-freqEASE)),4),0) " . "FROM tblStructuresOLAP $filter"; #rms distance #my $sql = "SELECT #ifnull(round(sqrt(sum(pow(freq-freqRandom,2)))/sqrt(sum(pow(freq-freqEASE,2))),4),0) " . # "FROM tblStructuresOLAP $filter"; my $sth = $dbh->prepare($sql); $sth->execute; my $score = ($sth->fetchrow_array)[0]; $sth->finish; return $score; }