#!/usr/bin/perl -w ############################################################################### #MinSupport_Biased-L2 Quick Evaluation #Copyright © 2004, Leon Bukhman #Last Updated 10/1/04 # # This script tests the MinSupport_Biased-L2 algorithm. It runs without the # need of external dependancies (such as a database connection). The dataset # being sampled is automaticaly generated and stored in a text file. # ############################################################################### use strict; use POSIX; ## Configuration Section ###################################################### # select what the script should do my $generateDataset = 1; #create a random iceberg-cube dataset to sample my $generateSample = 1; #generate a sample my $printStats = 1; #show information about results my $cleanUp = 1; #remove the generated files when done my $pruneAtBoundary = 1; #use Lossy Counting approximation # script variables my $datasetPath = "data.txt"; #path to the dataset file (input) my $samplePath = "sample.txt"; #path to the sample file (output) my $numFreq = 15; #number of frequent cells (d-itemsets) in cube my $noise = .01; #percent of infrequent d-itemsets my $n = 155000; #number of records to generate my $s = .01; #min support my $e = .001; #error (should be << min support) my $rate = .03; #sampling rate ## Driver Section ############################################################# buildIceburgCubeDataset() and print "Random dataset generated\n" if $generateDataset; generateL2MinSupportSample() and print "BL2 MinS sample generated\n" if $generateSample; printStats() and print "End of Stats\n" if $printStats; cleanUp() and print "Data files erased\n" if $cleanUp; ## Main Code Section ########################################################## my $mem; #memory usage (maximum bucket size) my $size; #records in resulting sample sub getTransaction { my @rec = split /,/, shift; my @items = (0,0,0,0,0,0,0,0); my $D1 = $rec[0]; #11 choices my $D2 = $rec[1]; #11 choices my $D3 = $rec[2]; #11 choices #coordinates in the cube will be $D1*51*51 + $D2*51 + $D3 $items[0]=0; $items[1]=$D1*121; $items[2]=$D2*11; $items[3]=$D3; $items[4]=$D1*121+$D2*11; $items[5]=$D1*121+$D3; $items[6]=$D2*11+$D3; $items[7]=$D1*121+$D2*11+$D3; shift @items; return @items; } sub generateL2MinSupportSample { #clear last run open DB, "$datasetPath" or die "Can't open $datasetPath : $!"; open OUT,"> $samplePath" or die "Can't open $samplePath : $!"; #prepare variables $mem=0; $size=0; my $N = 0; # Current transaction my $BucketID; # ID of Current Bucket my @f = (); # Approximate Frequency of item my @r = (); # Sampled Frequency of item my @t = (); # First access time (bucket id) of item #main loop while (my $line = ) { chomp $line; $N++; $BucketID = int(ceil($e * $N)); my @items = getTransaction($line); my ($sum_f, $sum_r)=(0,0); foreach my $i (@items) { $t[$i] = $BucketID if ! $f[$i]; $f[$i]++; $sum_f += $f[$i]; $sum_r += $r[$i] if $r[$i]; } #keep transaction if if (3.5 + $sum_r <= $rate*$sum_f) { print OUT "$line\n"; $size++; foreach my $i (@items) { $r[$i]++; } } #prune at bucket boundaries if ($pruneAtBoundary && $e*$N == ceil($e*$N)) { my $count = 0; for (my $i=0; $i<@f; $i++) { $count++ if $f[$i]; if ($f[$i] && $f[$i]+$t[$i]-1<=$BucketID){ delete $f[$i]; delete $r[$i]; delete $t[$i]; } $mem = $count if $count > $mem; } } } close DB; close OUT; } sub buildIceburgCubeDataset { open DB, "> $datasetPath" or die "Can't open $datasetPath : $!"; my $transactionsGenerated = 0; my @A = (); #array of random numbers $A[0]=0; $A[$numFreq]=1-$noise-$s*$numFreq; #sanity check return 0 if (1-$noise)/$numFreq < $s; #generate random numbers for (my $i=1; $i<$numFreq; $i++) { $A[$i] = rand(1-$noise-$s*$numFreq); } @A = sort @A; #generate frequent cell for (my $i=0; $i<$numFreq; $i++) { my $records = int($n*($s+$A[$i+1]-$A[$i])+.5); #round my ($d1,$d2,$d3) = (int(rand(10))+1, int(rand(10))+1, int(rand(10))+1); for (my $j=0; $j<$records; $j++) { print DB "$d1,$d2,$d3\n"; } $transactionsGenerated += $records; } #bring in the noise my $scale = $s<$noise ? $s/$noise : 1; while ($n>$transactionsGenerated) { my $records = int(rand($scale*($n-$transactionsGenerated)))+1; my ($d1,$d2,$d3) = (int(rand(10))+1, int(rand(10))+1, int(rand(10))+1); for (my $j=0; $j<$records; $j++) { print DB "$d1,$d2,$d3\n"; } $transactionsGenerated += $records; } close DB; return 1; } sub printStats { print " >Max bucket size = $mem\n"; print " >Transactions sampled = $size (of $n)\n"; print " >Expected sample size = ".$rate*$n."\n"; } sub cleanUp { unlink $datasetPath; unlink $samplePath; }