# Machine Learning/kdd sample
# Jump to navigation
# Jump to search
# get a random subsample of students from the training set
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
# ---------------------------------------------------------------------
# Command-line handling.
#
# Options (all optional):
#   -numitems <n>   number of ids to sample (default 1000)
#   -method <m>     'random' (default) or 'first'
#   -type <t>       kind of id to sample; only 'students' is implemented
#   -h              show the usage text and exit
# The single positional argument is the tab-separated input file.
# ---------------------------------------------------------------------
my $numItems = 1000;
my $method   = "random";
my $type     = "students";
my $help     = "";

# 'numitems=i' makes Getopt::Long reject non-integer counts, and the
# return value is kept so that a bad or unknown option stops the run
# instead of being reported and then ignored.
my $optionsOk = GetOptions('numitems=i' => \$numItems,
                           'method=s'   => \$method,
                           'type=s'     => \$type,
                           'h'          => \$help);

my $inputFile = shift(@ARGV);
if (not(defined($inputFile) and length($inputFile))) {
    # Without an input file there is nothing to do: show the usage text.
    $help = 1;
}

my $progname = basename($0);

if (not $optionsOk) {
    # Getopt::Long has already printed what was wrong with the options.
    print STDERR "Type '$progname -h' to get the help\n";
    exit(2);
}

if ($help) {
    print <<"END_OF_HELP";
This program will sample a tab-separated txt file of students.
It can be used to get all examples per student (for a number of students).

Basic usage:
${progname} <input file>

Full usage:
${progname} [-numitems <number of items>] [-method <'random'|'first'>] [-type <'students'>] <input file>

Examples:
${progname} algebra_2008_2009_train.txt
 by default, will create a sample of 1000 random students (all examples on those students)
${progname} -numitems 20000 algebra_2008_2009_train.txt
 create a sample of 20000 random students
${progname} -type students -method first algebra_2008_2009_train.txt
 create a sample of the first 1000 students
END_OF_HELP
    exit(0);
}

# Reject values the rest of the script cannot handle sensibly, rather than
# silently producing an empty or mislabelled sample.
my @problems = ();
if ($numItems < 1) {
    push @problems, "-numitems must be a positive integer (got '$numItems')";
}
if ($method ne "random" and $method ne "first") {
    push @problems, "-method must be 'random' or 'first' (got '$method')";
}
if ($type ne "students") {
    push @problems, "-type must be 'students' (got '$type')";
}
if (not(-f $inputFile and -r $inputFile)) {
    push @problems, "cannot read input file '$inputFile'";
}
if (@problems) {
    print STDERR "$progname: $_\n" for @problems;
    print STDERR "Type '$progname -h' to get the help\n";
    exit(2);
}

print "Type '$progname -h' to get the help\n";

# The id list is kept in ./download when that directory exists, otherwise
# in the current directory.
my $directory = "download";
if (not(-d $directory)) {
    $directory = ".";
}

my $outputFile = "${inputFile}_sample_${numItems}_${method}_${type}.csv";
print "Getting $numItems $method $type, putting in $outputFile\n";
# ---------------------------------------------------------------------
# Get the list of possible ids, in order of first appearance.
#
# The ids are read from "$directory/studentinfo.csv" (one id per line)
# when that file exists; otherwise they are extracted from column
# $idIndex of the input file and saved there for the next run.
#
# NOTE(review): the saved list is not tied to a particular input file, so
# it must be deleted by hand when switching to a different data set.
# ---------------------------------------------------------------------
my $sourceIdFile = "";
my $idIndex      = 1;     # zero-based column holding the id in the input
my %sourceIds    = ();    # ids already collected (for de-duplication)
my @sourceIds    = ();    # candidate ids, in file order
if ($type eq "students") {
    $sourceIdFile = "$directory/studentinfo.csv";
    if (not(-e $sourceIdFile)) {
        open(my $dataFh, '<', $inputFile)
            or die "Cannot read $inputFile: $!\n";

        # Saving the list is only an optimisation for later runs, so a
        # failure to create the file is reported but not fatal.
        my $listFh;
        if (not open($listFh, '>', $sourceIdFile)) {
            warn "Cannot write $sourceIdFile ($!); the id list will not be saved\n";
            $listFh = undef;
        }

        my $lineNumber = 0;
        while (defined(my $line = <$dataFh>)) {
            chomp($line);
            $lineNumber++;

            # The header row is not a student; the same /Student Id/ test
            # is used when the input is filtered later on.
            next if $lineNumber == 1 and $line =~ /Student Id/;

            my $id = (split(/\t/, $line))[$idIndex];
            next if not defined($id) or $id eq "";   # blank or short row
            next if exists $sourceIds{$id};

            $sourceIds{$id} = 1;
            # Keep first-seen order so that -method first gives the same
            # result on this run as on later runs that read the saved list.
            push @sourceIds, $id;
            print {$listFh} "$id\n" if $listFh;
        }
        close($dataFh);
        if ($listFh) {
            close($listFh)
                or warn "Problem while writing $sourceIdFile: $!\n";
        }
    } else {
        open(my $listFh, '<', $sourceIdFile)
            or die "Cannot read $sourceIdFile: $!\n";
        my $lineNumber = 0;
        while (defined(my $line = <$listFh>)) {
            chomp($line);
            $lineNumber++;

            # Lists saved by earlier versions of this script start with
            # the header cell of the id column; it is not a real id.
            next if $lineNumber == 1 and $line =~ /Student Id/;
            next if $line eq "";
            next if exists $sourceIds{$line};

            $sourceIds{$line} = 1;
            push @sourceIds, $line;
        }
        close($listFh);
    }
}
# ---------------------------------------------------------------------
# Choose the ids to pull.
#
# Takes up to $numItems ids out of @sourceIds: the leading ones when
# $method is "first", otherwise ids drawn at random without replacement.
# The chosen ids become the keys of %idsWanted; @sourceIds is consumed.
# ---------------------------------------------------------------------
my %idsWanted = ();
my $numFound  = 0;

my $numAvailable = scalar(@sourceIds);
if ($numItems > $numAvailable) {
    warn "Only $numAvailable $type available; taking all of them instead of $numItems\n";
}

# Stop as soon as the candidates run out: drawing from an empty list
# would yield undef, which ends up as an empty-string key that matches
# every row with a blank id.
while ($numFound < $numItems and @sourceIds) {
    my $id;
    if ($method eq "first") {
        $id = shift(@sourceIds);
    } else {
        my $index = int(rand(scalar(@sourceIds)));
        # Removing the id from the list guarantees it is not drawn twice.
        $id = splice(@sourceIds, $index, 1);
    }
    $idsWanted{$id} = 1;
    $numFound++;
}
# ---------------------------------------------------------------------
# Write the sampled rows of the training file and of the matching test
# file (same name with "train" replaced by "test").
# ---------------------------------------------------------------------

# Copy the header plus every wanted row from one tab-separated file to
# another.
#   $sourcePath  file to read
#   $targetPath  file to create (overwritten if it exists)
#   $wanted      hash ref whose keys are the ids to keep
#   $column      zero-based index of the id column
# A first line matching /Student Id/ is copied as the header; any other
# first line is reported and then treated as a data row.
# Progress is printed every 100000 lines.
# Returns the number of data rows written; dies when a file cannot be
# opened or the output cannot be completed.
sub copy_wanted_rows {
    my ($sourcePath, $targetPath, $wanted, $column) = @_;

    open(my $in, '<', $sourcePath)
        or die "Cannot read $sourcePath: $!\n";
    open(my $out, '>', $targetPath)
        or die "Cannot write $targetPath: $!\n";

    # Progress is measured in bytes read, so the percentage is right for
    # any input file instead of relying on a known line count.
    my $totalBytes  = -s $in;
    my $rowsWritten = 0;

    # check first line for header
    my $firstLine = <$in>;
    if (not defined($firstLine)) {
        warn "$sourcePath is empty\n";
    } else {
        chomp($firstLine);
        if ($firstLine =~ /Student Id/) {
            print {$out} "$firstLine\n";
        } else {
            print "Er...no header...\n$firstLine\n";
            my $id = (split(/\t/, $firstLine))[$column];
            if (defined($id) and $wanted->{$id}) {
                print {$out} "$firstLine\n";
                $rowsWritten++;
            }
        }
    }

    # now go through the rest of the lines
    my $lineNum = 1;
    while (defined(my $line = <$in>)) {
        chomp($line);
        # Short or blank rows have no id column and are never wanted.
        my $id = (split(/\t/, $line))[$column];
        if (defined($id) and $wanted->{$id}) {
            print {$out} "$line\n";
            $rowsWritten++;
        }
        if ($lineNum % 100000 == 0) {
            my $percent = $totalBytes
                ? sprintf("%.1f", 100 * tell($in) / $totalBytes)
                : "?";
            print "...line $lineNum ($percent %): "
                . (defined($id) ? $id : "") . "\n";
        }
        $lineNum++;
    }

    close($in);
    # Buffered write errors only show up when the handle is closed.
    close($out)
        or die "Problem while writing $targetPath: $!\n";
    return $rowsWritten;
}

print "Pulling $type ids (found " . scalar(keys %idsWanted) . "):\n";
print "This could take a while...\n";

# go through the training file and pull the wanted lines
copy_wanted_rows($inputFile, $outputFile, \%idsWanted, $idIndex);

# Do the same for the test file
my $test_input_file = $inputFile;
$test_input_file =~ s/train/test/;
my $output_test_file = "${test_input_file}_sample_${numItems}_${method}_${type}.csv";
if ($test_input_file eq $inputFile) {
    # Without "train" in the name the substitution changes nothing, and
    # a second pass would just overwrite the training sample.
    print "Input file name does not contain 'train'; not looking for a test file\n";
} elsif (not(-e $test_input_file)) {
    print "Test file $test_input_file not found; skipping it\n";
} else {
    copy_wanted_rows($test_input_file, $output_test_file, \%idsWanted, $idIndex);
}
exit(0);