#!/usr/bin/perl # Simple short-read simulator # A.P. Jason de Koning, 2010. http://jasondk.org/Teaching.html # This script will sample random forward-strand short reads for # a given read length, until requested amount of sequence is generated. # In this version, quality data is ignored and reads are perfect samples # taken from the template (i.e., are error free). # Usage: ./shortReadSimulator_perfectReads.pl readLength megaBasesToSequence templateFile.fasta ouputReadFileName.reads # Output: Reads will be saved to a file with the given name # Get command-line arguments $readLength = shift; $megaBasesToSequence = shift; $fileName = shift; $outputFileName = shift; # Set the output file name and open a file for output $theOutputFileName = ">" . $outputFileName; open(OUT, $theOutputFileName); # Calculate the number of reads we'll sequence, based on user input read length and sequencing amount. $numberOfReadsToSample = int( ($megaBasesToSequence * 1000000) / $readLength ); print "Sampling $fileName $numberOfReadsToSample times with $readLength bp short reads...\n"; # Open the file with template sequence to sequence via short-reads open (IN, $fileName) or die "Can't open $fileName\n"; # Initialize the random number generator srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip`); # Create and initialize a variable to hold the entire template sequence $theTemplateSequence = ""; # Read the sequence data into one big string while ($line = ) { # Remove the carriage return at the end of $line chomp($line); if ($line =~ />/) { # This line has a sequence name. Skip. } else { # Remove any spaces, tabs, etc from $line $line =~ s/\s\+//g; # Add this part of the sequence on the end of the template sequence $theTemplateSequence = $theTemplateSequence . $line; } } close IN; # Now theTemplateSequence contains all sequence data read from the file named fileName # We can now proceed to sample random reads # Get the total length of the template sequence string using the length() function $templateLength = length($theTemplateSequence); # Use a for loop to do the simulated sequencing. One iteration of the loop for each read we're generating... for ($i=0; $i<$numberOfReadsToSample; $i++) { # Get a random starting position for the read (making sure that we only get full-length reads) $startPosition = int( rand( ($templateLength - $readLength) ) ); # Get a read by taking a substring out of templateSequence starting at startPosition, of length readLength $read = substr( $theTemplateSequence, $startPosition, $readLength ); # Make all characters uppercase using uc() function uc($read); # Introduce some errors and record quality scores @qualityScores = (); for ($j=0; $j<$readLength; $j++) { # We will simply generate a quality score string indicating maximum quality $qualityScores[$j] = 'h'; } print OUT "SIMULATED-READ_". $startPosition . ":1:1:46:" . ($i+1) . "#0:"; print OUT $read . ":" . join('', @qualityScores) . "\n"; } close OUT; print "Done.\n";