#!/usr/bin/perl # Simple short-read simulator (with error-prone sites) # A.P. Jason de Koning, 2010. http://jasondk.org/Teaching.html # Note: these scripts are written for ease of understanding, not elegance. # This script will sample random short reads (in the forward and reverse orientation) for # a given read length, until requested amount of sequence is generated. # In this version, low quality sites are introduced 1% of the time # having a 64% error probability. Errors introduced by the simulator # are annotated as lowercase nucleotides. Quality scores are also set to accurately # identify error prone sites with their error probabilities. # Usage: ./shortReadSimulator.pl readLength megaBasesToSequence templateFile.fasta ouputReadFileName.reads # Output: Reads will be saved to a file with the given name (FASTQ FORMAT) # Get command-line arguments $readLength = shift; $megaBasesToSequence = shift; $fileName = shift; $outputFileName = shift; # Set the output file name and open a file for output $theOutputFileName = ">" . $outputFileName; open(OUT, $theOutputFileName); # Calculate the number of reads we'll sequence, based on user input read length and sequencing amount. $numberOfReadsToSample = int( ($megaBasesToSequence * 1000000) / $readLength ); print "Sampling $fileName $numberOfReadsToSample times with $readLength bp short reads...\n\n"; # Open the file with template sequence to sequence via short-reads open (IN, $fileName) or die "Can't open $fileName\n"; # Create and initialize a variable to hold the entire template sequence $theTemplateSequence = ""; # Read the sequence data into one big string while ($line = ) { # Remove the carriage return at the end of $line chomp($line); if ($line =~ />/) { # This line has a sequence name. Skip. } else { # Remove any spaces, tabs, etc from $line $line =~ s/\s\+//g; # Add this part of the sequence on the end of the template sequence $theTemplateSequence = $theTemplateSequence . $line; } } close IN; # Now theTemplateSequence contains all sequence data read from the file named fileName # We can now proceed to sample random reads # Get the total length of the template sequence string using the length() function $templateLength = length($theTemplateSequence); # Use a for loop to do the simulated sequencing. One iteration of the loop for each read we're generating... for ($i=0; $i<$numberOfReadsToSample; $i++) { # Get a random starting position for the read (making sure that we only get full-length reads) $startPosition = int( rand( ($templateLength - $readLength) ) ); # Get a read by taking a substring out of templateSequence starting at startPosition, of length readLength $read = substr( $theTemplateSequence, $startPosition, $readLength ); # Make all characters uppercase $read = uc($read); # Reverse-complement 50% of the reads if ( rand() <= 0.5 ) { $newRead = ""; # Do the reverse complement for ($j = ( length($read) - 1 ); $j >= 0; $j-- ) { $nucleotide = substr( $read, $j, 1 ); if ( $nucleotide eq 'T' ) { $newRead = $newRead . 'A'; } elsif ( $nucleotide eq 'A' ) { $newRead = $newRead . 'T'; } elsif ( $nucleotide eq 'C' ) { $newRead = $newRead . 'G'; } elsif ( $nucleotide eq 'G' ) { $newRead = $newRead . 'C'; } else { $newRead = $newRead . 'N'; } } $read = $newRead; } # Save the read's string as an array @readAsArray = split('', $read); # Introduce some errors and record quality scores @qualityScores = (); for ($j=0; $j<$readLength; $j++) { # Two-step error introduction: perfect vs error-prone sites # Introduce error-prone sites 1% of the time if ( rand() <= 0.01 ) { # The site will be error prone $readAsArray[$j] = mutateSiteWithErrorProbability( $readAsArray[$j], 0.63 ); # A PHRED score of 2 corresponds to a 63% error probability. Using that. $qualityScores[$j] = chr( (2 + 64) ) ; } else { # A minimal error score is PHRED 40, or error prob of 0.0001. Encoded +64 = 104 $qualityScores[$j] = chr( (40 + 64) ) ; } } # Since the read may have been modified, convert the array back to a string $read = join('', @readAsArray); # print OUT "SIMULATED-READ_". $startPosition . ":1:1:46:" . ($i+1) . "#0:"; #print OUT $read . ":" . join('', @qualityScores) . "\n"; # FASTQ format, see http://en.wikipedia.org/FASTQ_format print OUT "\@SIMULATED-READ_". $startPosition . ":1:1:46:" . ($i+1) . "#0\n"; print OUT $read . "\n"; print OUT "+\n"; print OUT join('', @qualityScores) . "\n"; } close OUT; print "Done.\n"; sub mutateSiteWithErrorProbability { # This subroutine will mutate a given nucleotide state with a given error probability $startingState = shift; $errorProbability = shift; # Get a random number between 0 and 1, call it '$u' $u = rand(); if ($u <= $errorProbability ) { # Need to modify the nucleotide state $newState = getNewNucleotide( $startingState ); } else { # Set the new state equal to the old state (no change) $newState = $startingState; } return $newState; } sub getNewNucleotide { # This subroutine will randomly return a new nucleotide state that is different from its input state # NOTE: if the given state is an 'N' or some other ambiguity code, it will randomly get resolved to a canonical state @nucleotides = ( 'A', 'T', 'C', 'G' ); $currentState = shift; # Loop until we've selected a new state $gotNewState = 0; while ($gotNewState == 0) { $choice = int( rand( 4 ) ); # if the new state is not-equal to ('ne') the starting state, we are done if ( $nucleotides[ $choice ] ne $currentState ) { $gotNewState = 1; } } # Use 'lc', the lowercase function, to set any mutated sites to a lowercase character return lc($nucleotides[ $choice ]); }