#!/usr/bin/perl -w ############################################################################### #Sampling Tests (Random Dataset) #Copyright © 2004, Leon Bukhman #Last Updated 6/1/04 # # This script tests the performance of Min Support versions of EASE, Biased EA, # and Biased L2. The data source is a randomly generated table located in a # mySQL database. # ############################################################################### use strict; use DBI; use POSIX; #configuration my $generateEASEMinSupportSample = 0; #run EASE (0 or 1) my $generateEAMinSupportSample = 0; #run Biased EA (0 or 1) my $generateL2MinSupportSample = 1; #run Biased L2 (0 or 1) my $generateEASEsample = 0; #run EASE (0 or 1) my $generateBiasedEAsample = 0; #run Biased EA (0 or 1) my $generateBiasedL2sample = 0; #run Biased L2 (0 or 1) my $analyzeSample = 1; #generate freq of items in Sample my $getScore = 1; #show how algorithm performed compared to SRS my $pruneAtBoundary = 1; #prune penalty info at bucket boundaries my $rebuildResultsTable = 1; #rebuild OLAP table with freq of original dataset my $analyzeRandom = 1; #generate freg of items in SRS my $randomSampleIterations = 3; #num times to generate SRS during analyzeRandom my $generateRandomDataset = 1; #generates the table of random itemsets #EASE variables (n=sample size, m=num items, h=num halvings) my ($n, $m, $h) = (155000, 1331, 5); # sample all data #Biased EA and L2 variables my $len = 7; #length of each transaction (2^d-1) my $rate = .03; #sample rate #Lossy Counting variables my $s = .01; #min support my $e = .001; #error #random dataset my $numFreq = 25; #number of frequent cells (d-itemsets) in cube my $noise = .30; #percent of infrequent d-itemsets #memory usage my $mem = 0; #connection to to database (anonymized) my $dsn = "DBI:mysql:database=[dbname];host=[hostname];port=[port]"; my $dbh = DBI->connect($dsn, "[username]", "[password]"); #run program buildIceburgCubeDataset() and print "Random dataset generated\n" if $generateRandomDataset; generateEASEMinSupportSample() and print "EASE MinS sample generated\n" if $generateEASEMinSupportSample; generateEAMinSupportSample() and print "BEA MinS sample generated\n" if $generateEAMinSupportSample; generateL2MinSupportSample() and print "BL2 MinS sample generated\n" if $generateL2MinSupportSample; generateEASEsample() and print "EASE sample generated\n" if $generateEASEsample; generateBiasedEAsample() and print "Biased EA sample generated\n" if $generateBiasedEAsample; generateBiasedL2sample() and print "Biased L2 sample generated\n" if $generateBiasedL2sample; rebuildResultsTable() and print "Results table rebuilt\n" if $rebuildResultsTable; analyzeRandom() and print "Random sample analyzed\n" if $analyzeRandom; analyzeSample() and print "Sample analyzed\n" if $analyzeSample; print "Score was " . getScore() ." ". getScore(1)." ". getScore(2)." ". getScore(3)."\n Mem Usage: ".$mem."\n" if $getScore; #disconnect $dbh->disconnect(); ### Main Code ################################################################# #define the transaction (given a record) sub getTransaction { my $rec = shift; my @items = (0,0,0,0,0,0,0,0); my $D1 = $rec->{'D1'}; #11 choices my $D2 = $rec->{'D2'}; #11 choices my $D3 = $rec->{'D3'}; #11 choices #coordinates in the cube will be $D1*51*51 + $D2*51 + $D3 $items[0]=0; $items[1]=$D1*121; $items[2]=$D2*11; $items[3]=$D3; $items[4]=$D1*121+$D2*11; $items[5]=$D1*121+$D3; $items[6]=$D2*11+$D3; $items[7]=$D1*121+$D2*11+$D3; shift @items; return @items; } sub generateL2MinSupportSample { #clear last run $dbh->do("UPDATE tblRandom SET Sample=0"); #select random sample for EASE my $order = "order by Sort"; my $sth = $dbh->prepare("SELECT * FROM tblRandom $order"); $sth->execute; #prepair variables my $N = 0; # Current transaction my $BucketID; # ID of Current Bucket my @f = (); # Approximate Frequency of item my @r = (); # Sampled Frequency of item my @t = (); # First access time (bucket id) of item #main loop while (my $ref = $sth->fetchrow_hashref) { $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($ref); my ($sum_f, $sum_r)=(0,0); foreach my $i (@items) { $t[$i] = $BucketID if ! $f[$i]; $f[$i]++; $sum_f += $f[$i]; $sum_r += $r[$i] if $r[$i]; } #keep transaction if if ($len/2. + $sum_r <= $rate*$sum_f) { $dbh->do("UPDATE tblRandom SET Sample=1 WHERE ID=$ref->{'ID'}"); foreach my $i (@items) { $r[$i]++; } } #prune at bucket boundaries if ($pruneAtBoundary && $e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<@f; $i++) { $count++ if $f[$i]; if ($f[$i] && $f[$i]+$t[$i]-1<=$BucketID){ delete $f[$i]; delete $r[$i]; delete $t[$i]; } $mem = $count if $count > $mem; } } } #clean up $sth->finish; } sub generateEASEMinSupportSample { #clear last run $dbh->do("UPDATE tblRandom SET Sample=0"); #select random sample for EASE my $order = "order by Sort"; my $sth = $dbh->prepare("SELECT * FROM tblRandom $order limit $n"); $sth->execute; #prepair variables my (@Q1, @Q2) = ((),()); #overall penalties my (@Q1temp, @Q2temp) = ((),());#penalties for current bucket my $N = 0; #current element my $BucketID; #ID of Current Bucket my @keptItems = (); #array of transactions of items kept per bucket #precompute deltas my @delta = (); for (my $i=0; $i<$h; $i++) { $delta[$i] = sqrt( 1-exp( -log(2*$m)/($n/(2**$i)) ) ); } #main loop while (my $ref = $sth->fetchrow_hashref) { $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($ref); #for each halving my $keep = 1; for (my $k=0; $k<$h; $k++) { my $d = $delta[$k]; my $sum = 0; foreach my $i (@items) { if ($Q1[$i][$k]) { #if frequent $sum += ($Q1[$i][$k]-$Q2[$i][$k]); } elsif ($Q1temp[$i][$k]) { #in current bucket $sum += ($Q1temp[$i][$k]-$Q2temp[$i][$k]); } } if ($sum<=0) { #keep it foreach my $i (@items) { if ($Q1[$i][$k]) { #if frequent $Q1[$i][$k] *= (1+$d); $Q2[$i][$k] *= (1-$d); } elsif ($Q1temp[$i][$k]) { #in current bucket only $Q1temp[$i][$k] *= (1+$d); $Q2temp[$i][$k] *= (1-$d); } else { #first time $Q1temp[$i][$k] = (1+$d); $Q2temp[$i][$k] = (1-$d); } } } else { #dont keep it foreach my $i (@items) { if ($Q1[$i][$k]) { #if frequent $Q1[$i][$k] *= (1-$d); $Q2[$i][$k] *= (1+$d); } elsif ($Q1temp[$i][$k]) { #in current bucket only $Q1temp[$i][$k] *= (1-$d); $Q2temp[$i][$k] *= (1+$d); } else { #first time $Q1temp[$i][$k] = (1-$d); $Q2temp[$i][$k] = (1+$d); } } $keep = 0; last; #exit loop on k } } if ($keep) { $dbh->do("UPDATE tblRandom SET Sample=1 WHERE ID=$ref->{'ID'}"); push (@keptItems, [@items]); } #prune at bucket boundaries and update penalties of freq items if ($e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<271; $i++) { for (my $j=0; $j<$h; $j++) { $count++ if $Q1[$i][$j]; $count++ if $Q1temp[$i][$j]; } } $mem = $count if $count > $mem; for my $i (0 .. $#keptItems) { #for each kept transaction my $aref = $keptItems[$i]; for my $j (0 .. $#{$aref}) { #for each item my $item = $aref->[$j]; for (my $k=0; $k<$h; $k++) { #for each halving $Q1[$item][$k] = $Q1temp[$item][$k] if ! $Q1[$item][$k]; $Q2[$item][$k] = $Q2temp[$item][$k] if ! $Q2[$item][$k]; } } } #free up memory @keptItems = (); @Q1temp = () if $pruneAtBoundary; @Q2temp = () if $pruneAtBoundary; } } #clean up $sth->finish; } sub generateEAMinSupportSample { #clear last run $dbh->do("UPDATE tblRandom SET Sample=0"); #select random sample for EASE my $order = "order by Sort"; my $sth = $dbh->prepare("SELECT * FROM tblRandom $order limit $n"); $sth->execute; #prepare variables my (@Q1, @Q2) = ((),()); #overall penalties my (@Q1temp, @Q2temp) = ((),());#penalties for current bucket my $N = 0; #current element my $BucketID; #ID of Current Bucket my @keptItems = (); #array of transactions of items kept per bucket #precompute delta my $delta = sqrt(log(2*$m)/($rate*$n)); #main loop while (my $ref = $sth->fetchrow_hashref) { $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($ref); foreach my $i (@items) { if ($Q1[$i]) { #if frequent $Q1[$i] *= (1-$delta)**$rate; $Q2[$i] *= (1+$delta)**$rate; } elsif ($Q1temp[$i]) { #in current bucket only $Q1temp[$i] *= (1-$delta)**$rate; $Q2temp[$i] *= (1+$delta)**$rate; } else { #first time $Q1temp[$i] = (1-$delta)**$rate; $Q2temp[$i] = (1+$delta)**$rate; } } my $sum = 0; foreach my $i (@items) { if ($Q1[$i]) { #if frequent $sum += ($Q1[$i]-$Q2[$i]); } elsif ($Q1temp[$i]) { #in current bucket $sum += ($Q1temp[$i]-$Q2temp[$i]); } } if ($sum<=0) { #keep it foreach my $i (@items) { if ($Q1[$i]) { #if frequent $Q1[$i] *= (1+$delta); $Q2[$i] *= (1-$delta); } elsif ($Q1temp[$i]) { #in current bucket $Q1temp[$i] *= (1+$delta); $Q2temp[$i] *= (1-$delta); } } $dbh->do("UPDATE tblRandom SET Sample=1 WHERE ID=$ref->{'ID'}"); push (@keptItems, [@items]); } #prune at bucket boundaries and update penalties of freq itemsets if ($e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<271; $i++) { $count++ if $Q1[$i]; $count++ if $Q1temp[$i]; } $mem = $count if $count > $mem; for my $i (0 .. $#keptItems) { #for each kept transaction my $aref = $keptItems[$i]; for my $j (0 .. $#{$aref}) { #for each item my $item = $aref->[$j]; $Q1[$item] = $Q1temp[$item] if ! $Q1[$item]; $Q2[$item] = $Q2temp[$item] if ! $Q2[$item]; } } #free up memory @keptItems = (); @Q1temp = () if $pruneAtBoundary; @Q2temp = () if $pruneAtBoundary; } } #clean up $sth->finish; } sub generateEASEsample { #clear last run $dbh->do("UPDATE tblRandom SET Sample=0"); #select random sample for EASE my $order = "order by Sort"; my $sth = $dbh->prepare("SELECT * FROM tblRandom $order limit $n"); $sth->execute; #prepare variables my (@d, @Q1, @Q2) = ((),(),()); for (my $k=0; $k<$h; $k++) { $d[$k] = sqrt( 1-exp( -log(2*$m)/($n/(2**$k)) ) ); for (my $i=0; $i<$m; $i++) { $Q1[$i][$k] = 1; $Q2[$i][$k] = 1; } } #main loop while (my $ref = $sth->fetchrow_hashref) { my @items = getTransaction($ref); my $keep = 1; #for each halving for (my $k=0; $k<$h; $k++) { my $sum = 0; foreach my $i (@items) { $sum += ($Q1[$i][$k]-$Q2[$i][$k]); } if ($sum<0) { #keep it foreach my $i (@items) { $Q1[$i][$k] *= (1+$d[$k]); $Q2[$i][$k] *= (1-$d[$k]); } } else { #dont keep it foreach my $i (@items) { $Q1[$i][$k] *= (1-$d[$k]); $Q2[$i][$k] *= (1+$d[$k]); } $keep = 0; last; #exit loop on k } } if ($keep) { $dbh->do("UPDATE tblRandom SET Sample=1 WHERE ID=$ref->{'ID'}"); } } #clean up $sth->finish; } sub generateBiasedEAsample { #clear last run $dbh->do("UPDATE tblRandom SET Sample=0"); #select random sample for EASE my $order = "order by Sort"; my $sth = $dbh->prepare("SELECT * FROM tblRandom $order limit $n"); $sth->execute; #prepare variables my $d = sqrt(log(2*$m)/($rate*$n)); my (@Q1, @Q2) = ((), ()); for (my $i=0; $i<$m; $i++) { $Q1[$i] = 1; $Q2[$i] = 1; } #main loop while (my $ref = $sth->fetchrow_hashref) { my @items = getTransaction($ref); foreach my $i (@items) { $Q1[$i] *= (1-$d)**$rate; $Q2[$i] *= (1+$d)**$rate; } my $sum = 0; foreach my $i (@items) { $sum += ($Q1[$i]-$Q2[$i]); } if ($sum<0) { #keep it foreach my $i (@items) { $Q1[$i] *= (1+$d); $Q2[$i] *= (1-$d); } $dbh->do("UPDATE tblRandom SET Sample=1 WHERE ID=$ref->{'ID'}"); } } #clean up $sth->finish; } sub generateBiasedL2sample { #clear last run $dbh->do("UPDATE tblRandom SET Sample=0"); #select random sample for EASE my $order = "order by Sort"; my $sth = $dbh->prepare("SELECT * FROM tblRandom $order limit $n"); $sth->execute; #prepair variables my (@f, @r) = ((), ()); #main loop while (my $ref = $sth->fetchrow_hashref) { my @items = getTransaction($ref); my ($sum_f, $sum_r)=(0,0); foreach my $i (@items) { $f[$i]++; $sum_f += $f[$i]; $sum_r += $r[$i] if $r[$i]; } #keep transaction if if (3.5 + $sum_r <= $rate*$sum_f) { $dbh->do("UPDATE tblRandom SET Sample=1 WHERE ID=$ref->{'ID'}"); foreach my $i (@items) { $r[$i]++; } } } #clean up $sth->finish; } sub generateRandomSample { #clear last run $dbh->do("UPDATE tblRandom SET Random=0"); #generate random sample my $sampleSize = $rate*$n; $dbh->do("UPDATE tblRandom SET Random=1 order by rand() limit $sampleSize"); } sub rebuildResultsTable { #clear the table $dbh->do("DELETE FROM tblRandomOLAP"); #generate itemsets my $sth = $dbh->prepare("INSERT INTO tblRandomOLAP (Dim1, Dim2, Dim3, Ord) " . "VALUES(?,?,?,?)"); for (my $a=0; $a<11; $a++) { for (my $b=0; $b<11; $b++) { for (my $c=0; $c<11; $c++) { my $ord = 3 - ($a == 0) - ($b == 0) - ($c == 0); $sth->execute($a,$b,$c,$ord); } } } #calculate the frequencies ($alg,$where) calculateItemsetFreq("",""); } sub analyzeSample { #clear last run $dbh->do("UPDATE tblRandomOLAP SET freqEASE=0"); #calculate the frequencies ($alg,$where) my $filter = "WHERE Sample=1"; #filter to extact sample calculateItemsetFreq("EASE",$filter); } sub analyzeRandom { #clear last run $dbh->do("UPDATE tblRandomOLAP SET freqRandom=0"); for (my $i=1; $i<=$randomSampleIterations; $i++) { generateRandomSample(); #calculate the frequencies ($alg,$where) my $filter = "WHERE Random=1"; #filter to extact sample calculateItemsetFreq("Random",$filter); print "--Generated random sample $i\n"; } #average the frequencies $dbh->do("UPDATE tblRandomOLAP SET freqRandom=freqRandom/$randomSampleIterations"); } #note frequencies get accumulated sub calculateItemsetFreq { my ($alg,$where) = @_; #get size of recordset my $sth = $dbh->prepare("SELECT count(*) FROM tblRandom $where"); $sth->execute; my $size = ($sth->fetchrow_array)[0]; $sth->finish; my $figures = '.00000'; #freq will be acurate to 2 + $figures places #itemset 0: ANY, ANY, ANY $dbh->do("UPDATE tblRandomOLAP SET freq$alg=freq$alg+1 " . "WHERE Dim1=0 and Dim2=0 and Dim3=0"); #itemset 1: D1, ANY, ANY $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT D1 as dim, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY D1"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.dim and Dim2=0 and Dim3=0"); $dbh->do("DROP TABLE tblA"); #itemset 2: ANY, D2, ANY $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT D2 as dim, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY D2"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=0 and Dim2=tblA.dim and Dim3=0"); $dbh->do("DROP TABLE tblA"); #itemset 3: ANY, ANY, D3 $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT D3 as dim, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY D3"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=0 and Dim2=0 and Dim3=tblA.dim"); $dbh->do("DROP TABLE tblA"); #itemset 4: D1, D2, ANY $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT d1, d2, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY d1, d2"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.d1 and Dim2=tblA.d2 and Dim3=0"); $dbh->do("DROP TABLE tblA"); #itemset 5: D1, ANY, D3 $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT d1, d3, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY d1, d3"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.d1 and Dim2=0 and Dim3=tblA.d3"); $dbh->do("DROP TABLE tblA"); #itemset 6: ANY, D2, D3 $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT d2, d3, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY d2, d3"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=0 and Dim2=tblA.d2 and Dim3=tblA.d3"); $dbh->do("DROP TABLE tblA"); #itemset 7: D1, D2, D3 $dbh->do("CREATE TEMPORARY TABLE tblA " . "SELECT d1, d2, d3, count(*)/$size$figures as freq " . "FROM tblRandom $where GROUP BY d1, d2, d3"); $dbh->do("UPDATE tblRandomOLAP, tblA " . "SET tblRandomOLAP.freq$alg=tblRandomOLAP.freq$alg+tblA.freq " . "WHERE Dim1=tblA.d1 and Dim2=tblA.d2 and Dim3=tblA.d3"); $dbh->do("DROP TABLE tblA"); } #bigger is better (negative score indicates worse performance than SRS) sub getScore { my $ord = shift; my $filter = "WHERE freq>$s "; if ($ord) { $filter .= "and Ord = $ord"; } #distance 1 my $sql = "SELECT ifnull(round(sum(abs(freq-freqRandom))/sum(abs(freq-freqEASE)),4),0) " . "FROM tblRandomOLAP $filter"; #rms distance #my $sql = "SELECT #ifnull(round(sqrt(sum(pow(freq-freqRandom,2)))/sqrt(sum(pow(freq-freqEASE,2))),4),0) " . # "FROM tblRandomOLAP $filter"; my $sth = $dbh->prepare($sql); $sth->execute; my $score = ($sth->fetchrow_array)[0]; $sth->finish; return $score; } sub buildIceburgCubeDataset { $dbh->do("delete from tblRandom"); #clear table my $transactionsGenerated = 0; my @A = (); #array of random numbers $A[0]=0; $A[$numFreq]=1-$noise-$s*$numFreq; #sanity check return 0 if (1-$noise)/$numFreq < $s; #generate random numbers for (my $i=1; $i<$numFreq; $i++) { $A[$i] = rand(1-$noise-$s*$numFreq); } @A = sort @A; #generate frequent cell for (my $i=0; $i<$numFreq; $i++) { my $records = int($n*($s+$A[$i+1]-$A[$i])+.5); #round my ($d1,$d2,$d3) = (int(rand(10))+1, int(rand(10))+1, int(rand(10))+1); my $sql = "insert into tblRandom (D1,D2,D3,Sample,Random,Sort) ". "values ($d1,$d2,$d3,0,0,rand())"; for (my $j=0; $j<$records; $j++) { $dbh->do($sql); } $transactionsGenerated = $transactionsGenerated + $records; } #bring in the noise my $scale = $s<$noise ? $s/$noise : 1; while ($n>$transactionsGenerated) { my $records = int(rand($scale*($n-$transactionsGenerated)))+1; my ($d1,$d2,$d3) = (int(rand(10))+1, int(rand(10))+1, int(rand(10))+1); my $sql = "insert into tblRandom (D1,D2,D3,Sample,Random,Sort) ". "values ($d1,$d2,$d3,0,0,rand())"; for (my $j=0; $j<$records; $j++) { $dbh->do($sql); } $transactionsGenerated = $transactionsGenerated + $records; } return 1; }