#!/usr/bin/perl -w # takes BRLMM calls for 50K, 250K Affymetrix SNP chip and generates either MERLIN or ALLEGRO input files for analysis # takes options for these and options on how to select marker subsets use strict; use warnings; use Getopt::Long; my $snpchoice; my $pedfile; my $whichsamplesfile; my $callfile; my $chip; my $binsize; my $outputdir; my $annot_dir; my $brlmm; my $prog; my $merr; my $uninf; my $mindist; GetOptions("annot_dir=s"=>\$annot_dir,"brlmm"=>\$brlmm,"snpchoice=i"=>\$snpchoice,"pedfile=s"=>\$pedfile,"prog=s"=>\$prog, "whichsamplesfile=s"=>\$whichsamplesfile,"callfile=s"=>\$callfile,"chip=i"=>\$chip,"binsize=f"=>\$binsize,"outputdir=s",=>\$outputdir, "merr",=>\$merr,"uninf",=>\$uninf,"mindist",=>\$mindist); # necessary options are random, pedfile, whichsamples, and chip # random - either 1 = highest het in int, or 0 = random choice # pedfile - the pedigree file with 6 columns: pedid/indid/father's id/mother's id/sex/affectedness status. It contains individuals ids as used in whichsamples, # whichsamplesfile - a file containing just one line with of length = # of individuals in the pedfile. An individual in the pedigree is 0 if there is no sample available, and # otherwise one of {1,....,samples in brlmm file} so that the right individuals in the brlmm file are used in the pedigree. If there are pedigree errors then it's worthwhile # double checking this file in particular. # brlmmfile - is the file from brlmm containing the calls # chip - is one of 6 options: 50KXba, 50KHind, 250KSty, 250KNsp, 5.0 and 6.0 chips # other (non-necessary options) are: # $binsize (default is set at 0.5 (cM)) which selects approximately 7000 SNPs if(!defined($snpchoice)||!defined($callfile)||!defined($pedfile)||!defined($whichsamplesfile)||!defined($chip)||!defined($annot_dir)||!defined($prog)){ print "Usage: linkdatagen.pl -pedfile -whichsamplesfile -callfile -chip {1,..,6} -snpchoice {0,1} -annot_dir annot_dir -prog {me/al/mo/pl/pr} [binsize outputdir merr uninf]\n\n"; print "Melanie Bahlo, Bioinformatics Division, The Walter and Eliza Hall Institute of Medical Research, Last updated September 2008.\n\n"; print "\t - the pedigree file with 6 columns: pedid/indid/father's id/mother's id/sex/affectedness status. It contains individuals ids as used in \n\n"; print "\t - a file containing just one line of length = # of individuals in the pedfile. An individual in the pedigree is 0 if there is no sample available, and "; print "\totherwise one of {1,....,samples} in brlmm file so that the right individuals in the brlmm file are used in the pedigree. If there are pedigree errors then it's worthwhile double checking this file in particular.\n\n"; print "\t - genotyping data is generated by brlmm (typically brlmm.calls.txt), if not defined is from crlmm (typically crlmm.calls.txt) and contains the genotype calls.\n\n"; print "\tbrlmm = specify that genotype calls are from brlmm, otherwise crlmm calls are assumed.\n\n"; print "\tbrlmm genotypes are one of {-1,0,1,2} corresponding to {missing,AA,AB,BB}\n"; print "\tcrlmm genotypes are one of {0,1,2,3} corresponding to {missing, AA,AB,BB}\n"; print "\tannot_dir - is the directory with the annotation files from Affymetrix. You can download these files (*.annot.csv) from www.affymetrix.org.\n\n"; print "\tchip - is one of 6 options: 1=50KXba, 2=50KHind, 3=250KSty, 4=250KNsp, 5=5.0, 6=6.0\n"; print "\tprog [all/al/me/mo/pr] - programchoice - choices are: -all for all (MERLIN, ALLEGRO, PREST, MORGAN, PLINK), pr for PREST, al for ALLEGRO, me for MERLIN, mo for MORGAN and pl for PLINK.\n"; print "\tsnpchoice - either 1 = highest het in SNP interval (size defined by binsize), or 0 = random choice\n\n"; print "Optional parameters\n\n"; print "\tbinsize - size of bin, in cM, from which to choose each snp. Default is 0.5 cM. To include all SNPs set binsize to 0\n"; print "\toutputdir - name of dir into which all -prog files will be written, e.g. _/.\n"; print "\tmerr - Specify if markers found to harbour Mendelian Errors are NOT to be removed from dataset.\n"; print "\tmindist - Turn off the minimumum distance between markers requirement. This is set to either >0.2 cM or 0.5*binsize whichever is smaller\n"; exit(1); } if(!defined($binsize)){ $binsize=0.5; # size of bin from which to extract ONE marker, to be made into an option. used to reduce size of files. print "Binsize has been set to default value of 0.5\n"; } else{ print "Binsize has been set to $binsize.\n"; } if(!defined($mindist)){ if($binsize*0.5>0.2){ $mindist=0.2; } else{ $mindist=$binsize*0.5; } print "Markers will be forced to be at least "; printf "%1.2f", $mindist; print " apart.\n"; } else{ $mindist=0; # defining mindist print "Markers will be forced to be at least "; printf "%1.2f", $mindist; print " apart.\n"; print "WARNING: Markers may still occasionally be very closely spaced.\n"; } my $merlindir; my $allegrodir; my $prestdir; my $morgandir; my $plinkdir; my $mbdir; if(!defined($outputdir)){ if($prog eq "all"||$prog eq "al"){ $allegrodir="allegro"; # default name for allegro dir print "The output dir for the ALLEGRO files will be allegro/\n"; } if($prog eq "all"||$prog eq "me"){ $merlindir="merlin"; # default name for merlin dir print "The output dir for the MERLIN files will be merlin/\n"; } if($prog eq "all"||$prog eq "pr"){ $prestdir="prest"; # default name for prest dir print "The output dir for the PREST files will be prest/\n"; } if($prog eq "all"||$prog eq "mo"){ $morgandir="morgan"; # default name for morgan dir print "The output dir for the MORGAN files will be morgan/\n"; } if($prog eq "all" || $prog eq "pl"){ $plinkdir="plink"; #default name print "The output dir for the PLINK files will be plink/\n"; } if($prog eq "mb"){ $plinkdir="mb"; #default name print "The output dir for the mb (INTERNAL) files will be mb/\n"; } } else{ print "Any output files will be written to ",$outputdir,"_"; $allegrodir=$outputdir."_allegro"; $merlindir=$outputdir."_merlin"; $morgandir=$outputdir."_morgan"; $prestdir=$outputdir."_prest"; $plinkdir=$outputdir."_plink"; $mbdir=$outputdir."_mb"; } if(!defined($brlmm)){ print "CRLMM genotype calls.\n"; } else{ print "BRLMM genotype calls.\n"; } print "Annotation dir is $annot_dir\n"; # dealing with getting the right annotation file my $chipfile; SWITCH:{ if($chip==1){ $chipfile=$annot_dir."/Mapping50K_Xba240.na22.annot.csv"; print "Opening annotation file $chipfile...\n"; last SWITCH; } if($chip==2){ $chipfile=$annot_dir."/Mapping50K_Hind240.na22.annot.csv"; print "Opening annotation file $chipfile...\n"; last SWITCH; } if($chip==3){ $chipfile=$annot_dir."/Mapping250K_Sty.na21.annot.csv"; print "Opening annotation file $chipfile...\n"; last SWITCH; } if($chip==4){ $chipfile=$annot_dir."/Mapping250K_Nsp.na21.annot.csv"; print "Opening annotation file $chipfile...\n"; last SWITCH; } if($chip==5){ $chipfile=$annot_dir."/GenomeWideSNP_5.na25.annot.csv"; print "Opening annotation file $chipfile...\n"; last SWITCH; } if($chip==6){ $chipfile=$annot_dir."/GenomeWideSNP_6.na25.annot.csv"; print "Opening annotation file $chipfile...\n"; last SWITCH; } print "Error: chip must be one of 1, 2, 3, 4, 5 or 6\n"; } # -----global vars------ my %genos_orig=(); # hash of arrays with hash containing ALL SNPs and array containing genotypes of people my %annot_orig=(); # hash of arrays with hash containting ALL SNP ids and array containing annotation data my %genos=(); # hash of arrays with hash containing chosen subset of SNPs and array containing genotypes of people #my %annot=(); # hash of arrays with hash containting chosen subset of SNP ids and array containing annotation data my @ped; # pedigree array #my @whichsamples=(0,0,0,0,0,0,0,0,14,12,11,13); # these are the samples that need to be put in the .pre file, others are going to be missing my @whichsamples=(); my $pedno=0; # no of individuals in the pedigree #my $snpchoice=1; # either 0 or 1, 0 = random choice SNP, 1 = highest het in interval my @chr_snp_cnt_orig=(); # numbers of Snps on each chromosome for ALL Snps my @chr_order_snp_orig=(); # array (chr) of array (genetic map) of array (all annot data) of snps ordered by 1) chromosome and 2) genetic map pos my @chr_snp_cnt=(); # numbers of Snps on each chromosome for subset of snps my @chr_order_snp=(); # array (chr) of array (genetic map) of array (all annot data) of snps ordered by 1) chromosome and 2) genetic map pos my @tempMB; my $pfile; my $command; my %bin_het_hash=(); my @which=(); @tempMB=split(/\./,$pedfile); $pfile=$tempMB[0]; # pedigree name # create output file dirs if($prog eq "all"||$prog eq "me"){ my $command="mkdir ".$merlindir; system($command); } if($prog eq "all"||$prog eq "al"){ $command="mkdir ".$allegrodir; system($command); } if($prog eq "all"||$prog eq "mo"){ $command="mkdir ".$morgandir; system($command); } if($prog eq "all"||$prog eq "pr"){ $command="mkdir ".$prestdir; system($command); } if($prog eq "all"||$prog eq "pl"){ $command="mkdir ".$plinkdir; system($command); } if($prog eq "mb"){ $command="mkdir ".$mbdir; system($command); } my $line; my @temp; my %temp_chr_hash=(); # order of the opening of these files is IMPORTANT!!!!! #read in the whichsamples file open(IN4,"<$whichsamplesfile") || die print "Can't open whichsamples file\n"; my @arr4; @arr4=; print "Reading in the whichsamples file...\n"; read_in_whichsamples(); undef(@arr4); close(IN4); print "Reading in the pedigree data...\n"; open(IN3,"<$pedfile") || die print "Can't open pedigree file\n"; # pedigree file my @arr3; @arr3=; read_in_ped(); undef(@arr3); close(IN3); # input data manipulation open(IN1,"<$callfile") || die print "Can't open brlmm_calls file\n"; my @arr1; @arr1=; print "Reading in the genotyping data...\n"; read_in_brlmm(); undef(@arr1); close(IN1); print "Reading in the genome annotation data...\n"; open(IN2,"<$chipfile") || die print "Can't open annotation file $chipfile\n";# annotation file with cM, all freqs, hets etc read_in_annot(); close(IN2); if($chip>4){ check_geno_markers(); } # more marker selection # get rid of all uninformative markers in the family (assume it's a family) if(defined($uninf)){ remove_uninf(); } # error checks - mainly for Allegro & MORGAN, merlin knocks them out itself if(!defined($merr)){ mendelian_errors(); } print "Splitting the annotation data over the different chromosomes and ordering by genetic map position...\n"; split_snps_to_chromos(); #print_ordered_snps(); # functions to reduce datafiles to fewer markers select_per_bin(); # make this with two options 1) random, 2) best - in the sense of highest het. # test --------- #print_genos_test(); undef(%genos_orig); undef(%annot_orig); # output files # merlin files if($prog eq "all" || $prog eq "me"){ print "Printing out merlin .pre files..\n"; print_pre_merlin(); print "Printing out merlin .map files...\n"; print_map_merlin(); print "Printing out merlin .freq files....\n"; print_freq_merlin(); print "Printing out merlin .dat files...\n"; print_dat_merlin(); print "Printing out merlin .in files with which to run each chromosome...\n"; print_merlin_run_files(); } # allegro files if($prog eq "all" || $prog eq "al"){ print "Printing out allegro .pre files..\n"; print_pre_allegro(); print "Printing out allegro .dat files..\n"; print_dat_allegro(); print "Printing out allegro .in files..\n"; print_allegro_run_files(); } # prest files if($prog eq "all"|| $prog eq "pr"){ print "Printing .idx files for PREST\n"; print_PREST_idx(); } # Morgan files if($prog eq "all"|| $prog eq "mo"){ print "Printing .par, .map and .ped files for MORGAN\n"; print_MORGAN(); } if($prog eq "all" || $prog eq "pl"){ print "Printing out .map and .ped file for PLINK\n"; print_PLINK(); } if($prog eq "mb"){ print "Printing out the genotypes in MB style\n"; print_MB(); } # ---------sub routines-------------- # ------------------------ read in of all the data ------------------------------------------- sub read_in_whichsamples{ @whichsamples=split(/\s+/,$arr4[0]); print "whichsamples are: @whichsamples\n"; } sub read_in_brlmm{ my $i; my $line_cnt=0; my $j; my $str="SNP_A"; for($i=0;$i<=$#whichsamples;$i++){ if($whichsamples[$i]!=0){ push(@which,$whichsamples[$i]); # which gives the nth sample which is relevant to include in the pedigree ## +1 because SNP is also in this line so counting from 1 to no_samples in whichsamples is fine and does not need to be adjusted } # which is of size < whichsampels as all missing inds are not included } foreach $line (@arr1){ if($line=~/$str/){ # only selecting markers with SNP_A @temp=split(/\s+/,$line); $j=0; for($i=0;$i<=$#which;$i++){ $genos_orig{$temp[0]}[$j]=$temp[$which[$i]]; # genos_orig has for each SNP an entry for each genotype in the order of the samples in the whichsamplesfile if($line_cnt<10){ print "$genos_orig{$temp[0]}[$j]\t"; } $j+=1; } if($line_cnt<10){ print "\n"; } $line_cnt+=1; if($line_cnt%10000==0){ print "Read in $line_cnt genotyping lines\n"; } } } print "Total number of SNPs in rlmm file =$line_cnt\n"; } sub read_in_ped{ my $i; my $j; if($arr3[0]!~/\d+/){ print "Deleted first line of pedigree file as it is a header line.\n"; shift(@arr3); } foreach $line (@arr3){ @temp=split(/\s+/,$line); if($#temp!=5){ print "Error in pedigree file.\n"; exit(1); } for($i=0;$i<=$#temp;$i++){ $ped[$pedno][$i]=$temp[$i]; } $pedno+=1; } for($i=0;$i<$pedno;$i++){ for($j=0;$j<6;$j++){ print "$ped[$i][$j]\t"; } print "\n"; } if($pedno!=($#whichsamples+1)){ print "Check your whichsamples file and your pedigree file.\n"; print "No of lines in the pedigree file = $pedno, No of entries in the whichsamples file = ",$#whichsamples+1,".\n"; print "There are too many/too few samples for the pedigree in the file.\n"; exit(1); } } # lines in annotation file when split on , # 1 - "SNP_A-1780619" # 2 - "10004759" # 3 - "rs17106009" # 4 - "1" # 5 - "NCBI Build 36.1, March 2006" # 6 - "126, May 2006" # 7 - "50433725" # 8 - "-" # 9 - "0" # 10 - "p33" # 11 - "ggatattgtgtgagga[A/G]taagcccacctgtggt" # 12 - "A" # 13 - "G" # 14 - "NM_021952 // intron // 0 // Hs.213050 // ELAVL4 // 1996 // Homo sapiens ELAV (embryonic lethal, abnormal vision, Drosophila)-like 4 (Hu antigen D) (ELAVL4), mRNA. /// ENST00000371827 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000357083 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000323186 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000371824 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000371823 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000361667 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000371821 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378] /// ENST00000371819 // intron // 0 // --- // --- // --- // ELAV-like protein 4 (Paraneoplastic encephalomyelitis antigen HuD) (Hu-antigen D). [Source:Uniprot/SWISSPROT;Acc:P26378]" # 15 - "72.030224900657 // D1S2824 // D1S197 // --- // --- /// 76.2778636775225 // D1S2706 // D1S2661 // AFMA337WC1 // AFMA203YE9 /// 68.1611616801535 // --- // --- // 59969 // 770243" # 16 - "D1S1559 // downstream // 51841 /// D1S2299E // upstream // 6915" # 17 - "574 // 50433477 // 50434050" # 18 - "0.0 // 1.0 // Chinese /// 0.0 // 1.0 // Japanese" # 19 - "0.022222 // 0.977778 // Yoruba" # 20 - "0.010204 // 0.989796 // Caucasian" # 21 - "0.0 // Chinese /// 0.0 // Japanese" # 22 - "0.043457 // Yoruba" # 23 - "0.0202 // Caucasian" # 24 - "45.0 /// 45.0" # 25 - "45.0" # 26 - "49.0" # for 6.0 chip # 1"Probe Set ID" # 2 "Affy SNP ID" # 3 "dbSNP RS ID" # 4 "Chromosome" # 5 "Physical Position" # 6 "Strand" # 7 "ChrX pseudo-autosomal region 1" # 8 "Cytoband" # 9 "Flank" #10 "Allele A" # 11 "Allele B" # 12 "Associated Gene" # 13 "Genetic Map" # 14 "Microsatellite" # 15 "Fragment Enzyme Type Length Start Stop" #16 "Allele Frequencies" # 17 "Heterozygous Allele Frequencies" # 18 "Number of individuals/Number of chromosomes" # 19 "In Hapmap" #20 "Strand Versus dbSNP" # 21 "Copy Number Variation" # 22 "Probe Count" # 23"ChrX pseudo-autosomal region 2" # 24"In Final List" # 25"Minor Allele" # 26 "Minor Allele Frequency" sub read_in_annot{ my $i; my $line; my $line_cnt=0; my @temp1=(); my @temp2=(); my $snpprobcnt=0; my $thingtomatch=","; my $missthismarker=0; my $key; my $test=0; my $totalline_cnt=0; if($chip<5){ $line=readline(IN2); } else{ for($i=0;$i<=15;$i++){ $line= readline(IN2); #splice(@arr2,0,15); # gets rid of first 15 lines for larger annotation files. } } if($chip>2){ # special case for the 250K Affymetrix annotation files LINE:while ($line=readline(IN2)){ $totalline_cnt+=1; @temp=(); $test=0; @temp=split(/","/,$line); # simple check: #print "temp[0]=$temp[0], temp[1]=$temp[1],temp[2]=$temp[2]\n"; #print "temp[3]=$temp[3],temp[4]=$temp[4],temp[5]=$temp[5]\n"; for($i=0;$i<=$#temp;$i++){ $temp[$i]=~tr/"//d; } #print "temp[0] = $temp[0], temp[3]=$temp[3]\n temp[9]=$temp[9], temp[12]=$temp[12],\n temp[15]=$temp[15]\n"; #print "temp[16]=$temp[16]\n"; #exit(1); # since there is an awful lot of data in the annotation file we are only going to store select bits and pieces. # 1, 4, 12,15 (needs further splitting by //, then 1st entry), 20 (needs further splitting by //, then 1st entry), 23 (needs further splitting by //, then 1st entry) if($chip>4){ if(!defined($temp[3])){ $snpprobcnt+=1; next LINE; } else{ if($temp[3]!~/\d|X/){ #print "Problem with chromosome: $temp[3]\n"; #print "$snpprobcnt : @temp"; $snpprobcnt+=1; next LINE; } } @temp1=split(/\/+/,$temp[12]); # decode gm data #print "temp1 from 12 = @temp1\n"; if($temp1[0]!~/\d+\.\d+/){ #print "Problem with map pos: $temp1[0]\n"; #print "@temp\n"; $snpprobcnt+=1; next LINE; } @temp2=split(/\/+/,$temp[15]); #all freqs cauc #print "temp2 =@temp2, from line 15 = $temp[15]\n"; if($temp2[2]=~/H/){ # strange swap in annotation files from listing of chinese then caucasian to caucasian then chinese, picked up by Catherine. if($temp2[3]<0.0){ print "Problem with allele freq: $temp2[3]\n"; print "@temp\n"; $snpprobcnt+=1; next LINE; } } else{ if($temp2[0]<0.0){ print "Problem with allele freq: $temp2[0]\n"; print "@temp\n"; $snpprobcnt+=1; next LINE; } } if(!defined(@genos_orig{$temp[0]})){ #print "Marker $temp[0] is not in the genotyping datafile\n"; $missthismarker+=1; next LINE; } if($temp[3]!~/\d|X/){ print "problem here: chr=$temp[3]\n"; exit(1); } $annot_orig{$temp[0]}[0]=$temp[0]; # snp name $annot_orig{$temp[0]}[1]=$temp[3]; # chromosome $annot_orig{$temp[0]}[2]=$temp[9]; # allelele A $annot_orig{$temp[0]}[3]=$temp1[0]; # sex averaged gm position if($temp2[2]=~/H/){ # strange swap in annotation files from listing of chinese then caucasian to caucasian then chinese, picked up by Catherine. $annot_orig{$temp[0]}[4]=$temp2[3]; # all freq allele A cauc } else{ $annot_orig{$temp[0]}[4]=$temp2[0]; # all freq allele A cauc } $annot_orig{$temp[0]}[5]=2*$temp2[3]*(1-$temp2[3]); $annot_orig{$temp[0]}[6]=$temp[2]; # rs name $annot_orig{$temp[0]}[7]=$temp[4]; # genomic pos $line_cnt+=1; } else{ $annot_orig{$temp[0]}[0]=$temp[0]; # snp name $annot_orig{$temp[0]}[1]=$temp[3]; # chromosome $annot_orig{$temp[0]}[2]=$temp[11]; # allelele A @temp1=split(/\/+/,$temp[14]); # decode gm data #print "temp1 from 14 = @temp1\n"; $annot_orig{$temp[0]}[3]=$temp1[0]; # sex averaged gm position @temp1=split(/\/+/,$temp[19]); #all freqs cauc #print "temp1 from 19 = @temp1\n"; $annot_orig{$temp[0]}[4]=$temp1[0]; # all freq allele A cauc @temp1=split(/\/+/,$temp[22]); # het caucasian #print "temp1 from 22 = @temp1\n"; $annot_orig{$temp[0]}[5]=$temp1[0]; #print "annotation line = @{$annot{$temp[0]}}\n"; $annot_orig{$temp[0]}[6]=$temp[2]; # rs name $annot_orig{$temp[0]}[7]=$temp[6]; # genomic pos $line_cnt+=1; } if($line_cnt%100000==0){ print "Read in $line_cnt SNP annotation lines\n"; #exit(1); } } } else{ # special case necessary for the 50K chips which have different Affymetrix annotation files while ($line =readline(IN2)){ @temp=split(/","/,$line); # simple check: for($i=0;$i<=$#temp;$i++){ $temp[$i]=~tr/"//d; } #print "temp[0] = $temp[0]\n temp[3]=$temp[3]\n temp[11]=$temp[11]\n"; # since there is an awful lot of data in the annotation file we are only going to store select bits and pieces. # 1, 4, 12,15 (needs further splitting by //, then 1st entry), 20 (needs further splitting by //, then 1st entry), 23 (needs further splitting by //, then 1st entry) $annot_orig{$temp[0]}[0]=$temp[0]; # snp name $annot_orig{$temp[0]}[1]=$temp[3]; # chromosome $annot_orig{$temp[0]}[2]=$temp[11]; # allelele A @temp1=split(/\/+/,$temp[14]); # decode gm data #print "temp1[0] from 14 (decode gm data) = $temp1[0]\n"; $annot_orig{$temp[0]}[3]=$temp1[0]; # sex averaged gm position @temp1=split(/\/+/,$temp[17]); #all freqs cauc #print "temp1=@temp1, vector with all freqs\n"; #print "temp1[6] from 17 (all freq cauc) = $temp1[6]\n"; $annot_orig{$temp[0]}[4]=$temp1[6]; # all freq allele A cauc # het is available for 50K chip, but calculate on the fly anyway (it's in temp[18]) $annot_orig{$temp[0]}[5]=1-$temp1[6]*$temp1[6]-(1-$temp1[6])*(1-$temp1[6]); #print "Het for freq $temp1[6] = $annot_orig{$temp[0]}[5]\n"; #print "annotation line = @{$annot{$temp[0]}}\n"; $annot_orig{$temp[0]}[6]=$temp[2]; # rs name $annot_orig{$temp[0]}[7]=$temp[6]; # genomic pos $line_cnt+=1; if($line_cnt%10000==0){ print "Read in $line_cnt SNP annotation lines\n"; #exit(1); } } } print "Total total number of SNPs in the Affy annotation file = $totalline_cnt\n"; print "Total number of SNPs in Affymetrix annotation file = $line_cnt\n"; print "Total of $snpprobcnt SNPs have annotation problems\n"; print "Total number of markers removed (5.0 chip upwards), since not represented in genotyping file = $missthismarker\n\n"; #exit(1); } # check that all markers in the genotype data are in fact available in the annotation data file & remove all mendelian error SNPs & all uninformative markers sub remove_uninf{ my $key; my $i; my $m; my $test=0; my %reject_snps=(); my $k; my @totest=(); SNP:foreach $key (keys %genos_orig){ $i=0; $test=0; while($test==0 && $i<=$#which){ if($genos_orig{$key}[$which[$i]]!=0){ $test=1; } $i+=1; } for($k=$i;$k<=$#which;$k++){ if($genos_orig{$key}[$which[$k]]!=$test){ next SNP; } } $reject_snps{$key}=1; } print "Checking: no of snps prior to uninformative marker removal = ",scalar keys %annot_orig,"\n"; print "Total no of uninformative markers detected = ",scalar keys (%reject_snps),"\n"; foreach $key (keys %reject_snps){ delete($genos_orig{$key}); delete($annot_orig{$key}); } print "Checking: Number of snps now, after removal = ",scalar keys %annot_orig,"\n"; } # find all mendelian error Snps and remove from both the annotation data and the genotype hashes. sub mendelian_errors{ my $key; my $i; my $j; my $k; my $m; my $l; my $genom; my $genof; my $genoc; my %reject_snps=(); my $sum_total=0; my $xchromsnpcnt=0; my $females=0; my $malemean=0; my $femalemean=0; my %sexcheck=(); my %zstat=(); my $chrom=0; my @chrom_tot=(); my @chrom_mend=(); my $males=0; # sex check first.... IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped for($m=0;$m<=$#which;$m++){ if($which[$m]==$whichsamples[$i]){ last; } } $xchromsnpcnt=0; SNP:foreach $key (keys %genos_orig){ if($annot_orig{$key}[1] eq "X"){ # X chromosome needs special checks if(defined($brlmm)){ if($genos_orig{$key}[$m]==1){ $sexcheck{$ped[$i][1]}+=1; # counts the no of hets if($ped[$i][4]==1){ $genos_orig{$key}[$m]=-1; # fixing males $reject_snps{$key}+=1; } } } else{ if($genos_orig{$key}[$m]==2){ $sexcheck{$ped[$i][1]}+=1; # counts the no of hets if($ped[$i][4]==1){ $genos_orig{$key}[$m]=0; # fixing males $reject_snps{$key}+=1; } } } $xchromsnpcnt+=1; } } $sexcheck{$ped[$i][1]}/=$xchromsnpcnt; #print "sexcheck{$ped[$i][1]}=$sexcheck{$ped[$i][1]}\n"; } } IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped if($ped[$i][4]==1){ $malemean+=$sexcheck{$ped[$i][1]}; $males+=1; } if($ped[$i][4]==2){ $femalemean+=$sexcheck{$ped[$i][1]}; $females+=1; } } } if($males>0){ $malemean/=$males; } if($females>0){ $femalemean/=$females; } # normalised N(0,1) proportions IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped if(defined($sexcheck{$ped[$i][1]}) && $ped[$i][4]==1){ $zstat{$ped[$i][1]}=($sexcheck{$ped[$i][1]}-$malemean)/(sqrt($malemean*(1-$malemean)/$males)); } } } IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped if(defined($sexcheck{$ped[$i][1]}) && $ped[$i][4]==2){ $zstat{$ped[$i][1]}=($sexcheck{$ped[$i][1]}-$femalemean)/(sqrt($femalemean*(1-$femalemean)/$females)); } } } print "Pedigree has a total of ",$males," genotyped males, and a total of ",$females," genotyped females.\n"; print "Mendelian Error distribution on the X chromosome based purely on observed hets per genotyped individual\n"; print "Individual\tSex\tProportion of hets\tZ stat\n"; print "===========================\n"; IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped print "$ped[$i][1]\t"; if($ped[$i][4]==1){ # male print "male\t"; } else{ print "female\t"; } if(defined($sexcheck{$ped[$i][1]})){ printf "%1.2f",$sexcheck{$ped[$i][1]}; print "\t"; } if(defined($zstat{$ped[$i][1]})){ printf "%1.2f",$zstat{$ped[$i][1]}; } print "\n"; } } print "===========================\n"; IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped if($zstat{$ped[$i][1]}>2 || $zstat{$ped[$i][1]} < -2){ print "Individual $ped[$i][1] seems unlikely to have the right sex. Please check your data before proceeding.\n"; exit(1); } } } IND:for($i=0;$i<$pedno;$i++){ # individual to be tested by looking for his/her parents and checking for mendelian errors if($whichsamples[$i]!=0){ # individual has been genotyped for($m=0;$m<=$#which;$m++){ if($which[$m]==$whichsamples[$i]){ #print "Individual $i has a corresponding which file entry of which[$m]=$which[$m]\n"; #$m1=$m; if($ped[$i][2] ne "0"){ # individual is not a founder for($j=0;$j<$pedno;$j++){ if($ped[$j][1] eq $ped[$i][2]){ # find the father's genotyping sample if($whichsamples[$j]!=0){ #print "Found father's genotype: ped[$i][1]=$ped[$i][1],whichsamples$i]=$whichsamples[$i], ped[$i][2]=$ped[$i][2],whichsamples[$j]=$whichsamples[$j]\n"; for($k=0;$k<=$#which;$k++){ if($which[$k]==$whichsamples[$j]){ #print "Individual $j (father) has a corresponding which entry of which[$k]=$which[$k]\n"; last; } } } } if($ped[$j][1] eq $ped[$i][3]){ # find the mother's genotyping sample if($whichsamples[$j]!=0){ #print "Found mother's genotype: ped[$i][1]=$ped[$i][1],whichsamples[$i]=$whichsamples[$i], ped[$i][3]=$ped[$i][3],whichsamples[$j]=$whichsamples[$j]\n"; for($l=0;$l<=$#which;$l++){ if($which[$l]==$whichsamples[$j]){ #print "Individual $j (mother) has a corresponding which entry of which[$l]=$which[$l]\n"; last; } } } } } if(!defined($k)&& !defined($l)){ # test for missing father && missing mother - no test possible next IND; } #print "child is which[$m]=$which[$m], father is which[$k]=$which[$k], mother is which[$l]=$which[$l]\n"; #print "child is $m, parents are $k,$l\n"; #exit(1); # NEED to change this to cater for X linked SNPs. SNP:foreach $key (keys %genos_orig){ #if($key eq "SNP_A-1956797"){ # print "father: genos_orig{$key}[$k]=$genos_orig{$key}[$k], mother: genos_orig{$key}[$l]=$genos_orig{$key}[$l], child: genos_orig{$key}[$m]=$genos_orig{$key}[$m]\n"; #} # insert the brlmm equivalent - simply use +1 on all of the below... # father's check individually if(defined($k)){ $genof=$genos_orig{$key}[$k]; } else{ if(defined($brlmm)){ $genof=-1; } else{ $genof=0; } } if(defined($l)){ $genom=$genos_orig{$key}[$l]; } else{ if(defined($brlmm)){ $genom=-1; } else{ $genom=0; } } $genoc=$genos_orig{$key}[$m]; if(defined($brlmm)){ if(defined($k)){ $genof+=1; } if(defined($l)){ $genom+=1; } $genoc+=1; } # sex check # father's check if($annot_orig{$key}[1] ne "X" || $ped[$i][4]==2){ # X chromosome needs special checks, only carry out DAD check on autosomes, rejection ok for X chromosome for son,but not for daughter if($genof*$genoc>0){ # ensures we have no missing genotype data if(abs($genof-$genoc)==2){ $reject_snps{$key}+=1; next SNP; } } } # mother's check individually if($genom*$genoc>0){ # ensures we have no missing genotype data if(abs($genom-$genoc)==2){ $reject_snps{$key}+=1; next SNP; } } # and a joint check if($annot_orig{$key}[1] ne "X"|| $ped[$i][4]==2){ # X chromosome needs special checks, only carry out DAD check on autosomes, rejection ok for X chromosome, no trio check possible if($genom*$genoc*$genof>0){ # ensures we have no missing genotype data # father AA, offspring AB, mother cannot be AA # father BB, offspring AB, mother cannot be BB if(abs($genof-$genoc)==1 && ($genof-$genom)==0){ if($genof!=2){ $reject_snps{$key}+=1; next SNP; } } # mother AA, offspring AB, father cannot be AA if(abs($genom-$genoc)==1 && ($genof-$genom)==0){ if($genom!=2){ $reject_snps{$key}+=1; next SNP; } } } } } } #next IND; } } } } for($i=0;$i<=22;$i++){ $chrom_tot[$i]=0; $chrom_mend[$i]=0; } SNP:foreach $key (keys %annot_orig){ if($annot_orig{$key}[1] eq "X"){ $chrom=22; $chrom_tot[$chrom]+=1; if(defined($reject_snps{$key})){ $chrom_mend[$chrom]+=1; } } else{ if($annot_orig{$key}[1]=~/\d/){ $chrom=$annot_orig{$key}[1]-1; $chrom_tot[$chrom]+=1; if(defined($reject_snps{$key})){ $chrom_mend[$chrom]+=1; } } } } print "Mendelian Error distribution over all chromosomes\n"; print "Chr MendErr Rate\n"; print "======================\n"; for($i=0;$i<=22;$i++){ print $i+1," $chrom_mend[$i] "; printf "%1.2f",$chrom_mend[$i]/$chrom_tot[$i]; print "\n"; } print "======================\n"; foreach $key (keys %reject_snps){ $sum_total+=$reject_snps{$key}; } print "Checking: no of snps prior to Mendelian error removal = ",scalar keys %annot_orig,"\n"; print "Total no of Mendelian errors detected = $sum_total\n"; print "Total no of SNPs found to have Mendelian errors = ", scalar keys %reject_snps,"\n"; foreach $key (keys %reject_snps){ delete($genos_orig{$key}); delete($annot_orig{$key}); } print "Checking: Number of snps now, after removal = ",scalar keys %annot_orig,"\n"; } sub check_geno_markers{ my $key; my $cnt=0; my @to_remove=(); print "Checking the markers against genotype data and annotation file data and vice versa..\n"; foreach $key (keys %genos_orig){ if(!defined($annot_orig{$key}[0])){ #print "SNP $key is in genotyping data file and not in the annotation file\n"; $cnt+=1; # remove this SNP from the genotyping data file. push(@to_remove,$key); } } print "Total number of markers not in annotation file = $cnt\n"; print "Removing these markers from the genotyping data file...\n"; foreach $key (@to_remove){ delete($genos_orig{$key}); } $cnt=0; # check no of lines now: foreach $key (keys %annot_orig){ $cnt+=1; } @to_remove=(); print "Total number of annotation lines = $cnt\n"; foreach $key (keys %annot_orig){ if(!defined($genos_orig{$key})){ push(@to_remove,$key); } } print "SNPs to remove that are in annot file but not in geno file = ",$#to_remove+1,"\n"; foreach $key (@to_remove){ delete($annot_orig{$key}); } print "Total number of annotation lines after adjustment= ",scalar keys %annot_orig,"\n"; $cnt=0; foreach $key (keys %genos_orig){ $cnt+=1; } print "Total number of genotype lines = $cnt\n"; } # splits all snps into chromosomes and puts them in the RIGHT order, which is necessary for # all .pre style output files. # current storage in annot hash array is 0 = snpname, 1 = chromosome, 2 = allelele A, 3 = cM pos, 4 =all freq A, 5 = het caucasian sub split_snps_to_chromos{ my $i; my $j; my $k; my @chr_order=(); # array of arrays with SNP names as entries my $whichsnp; my $whichcM; my @snporder_sorted=(); my $key; my $line_cnt=0; my $snptotal=0; my $cnt=0; # sort into chromos first foreach $key (keys %annot_orig){ #print "SNP: $key, chromo: $annot_orig{$key}[1], cM: $annot_orig{$key}[3]\n"; #exit(1); if(!defined($annot_orig{$key}[1])){ print "Problem with marker $key\n"; print "SNP: $key, chromo: $annot_orig{$key}[1], cM: $annot_orig{$key}[3]\n"; exit(1); $cnt+=1; } if($annot_orig{$key}[1]!~/\d+/){ # not 1 -22 if($annot_orig{$key}[1]=~/X|x/){ # X chromosome special $chr_snp_cnt_orig[22]+=1; push(@{$chr_order[22]},$key); #print "should be x chromo: $annot_orig{$key}[1]\n"; } else{ # chr y etc and other riff raff $chr_snp_cnt_orig[23]+=1; push(@{$chr_order[23]},$key); #print "should be non x non auto chromo: $annot_orig{$key}[1]\n"; } } else{ $chr_snp_cnt_orig[$annot_orig{$key}[1]-1]+=1; push(@{$chr_order[$annot_orig{$key}[1]-1]},$key); #print "should be autosome: $annot{$key}[1]\n"; } $line_cnt+=1; #if($line_cnt%100000==0){ # print "$line_cnt SNPs put into hash\n"; # exit(1); #} } print "Chr\tTotal SNP #\n"; print "========================\n"; for($i=0;$i<23;$i++){ print $i+1,"\t$chr_snp_cnt_orig[$i]\n"; $snptotal+=$chr_snp_cnt_orig[$i]; } print "========================\n"; print "Total\t$snptotal\n\n"; # sorting the chromosome specific arrays by genetic map position for($i=0;$i<23;$i++){ print %temp_chr_hash=(); @snporder_sorted=(); for($j=0;$j<$chr_snp_cnt_orig[$i];$j++){ $whichsnp=$chr_order[$i][$j]; $whichcM=$annot_orig{$whichsnp}[3]; $temp_chr_hash{$whichsnp}=$whichcM; # now sort into the correct order within each chromo } sub numerically {$temp_chr_hash{$a} <=> $temp_chr_hash{$b}}; @snporder_sorted=sort numerically keys %temp_chr_hash; # creating the final, sorted array (chr) of arrays (snps) of arrays (all the data we need) for($j=0;$j<$chr_snp_cnt_orig[$i];$j++){ if($snporder_sorted[$j] eq "SNP_A-8645531"){ print "Position $j on chr 6\n"; $key=$snporder_sorted[$j]; for($k=0;$k<=7;$k++){ print "$annot_orig{$key}[$k] "; } print "\n\n"; } $key=$snporder_sorted[$j]; for($k=0;$k<=7;$k++){ $chr_order_snp_orig[$i][$j][$k]=$annot_orig{$key}[$k]; } } } } sub print_ordered_snps{ my $i; my $j; my $k; print "Chr\tSNP\tChr(annot)\tAllele\tPos(cM)\tAllele freq\tHet\trs\tbppos\n"; for($i=0;$i<23;$i++){ for($j=0;$j<$chr_snp_cnt_orig[$i];$j++){ #for($j=0;$j<20;$j++){ print $i+1,"\t"; for($k=0;$k<=7;$k++){ print "$chr_order_snp_orig[$i][$j][$k]\t"; } print "\n"; } print "-------------------------\n"; print "$chr_snp_cnt_orig[$j]\n"; print "-------------------------\n"; #exit(1); } } # --------subsetting of the data -------------------------------------- # function to select only a subset of markers # allow for two options: 1) random choice of marker per bin or 2) pick the "best" in the sense of max het. sub select_per_bin{ my $i; my $j; my $k; my $entry; my $key; my $value; my $rand_snp; my @temp_snp_order=(); my @final_list_of_snps=(); my @final_list_of_snps_names=(); my @current_bin_markers=(); my $snp_length=0; my @snporder_sorted=(); my $current_marker_start=0; my @keyorder=(); my $sum_of_snps=(); my $sum_of_snps_orig=(); my @final_list_of_hets=(); my $avge_hets=0; my $gws_hets=0; my $mytest=0; # for error checking within this function my $mydist; my $newmappos; my $oldmappos; my $test=0; my $final=0; # subselect in chr_order_snp & reduce chr_snp_cnt too (these contain all the necessary information for($i=0;$i<23;$i++){ for($j=0;$j<$chr_snp_cnt_orig[$i];$j++){ for($k=0;$k<=$#which;$k++){ if(!defined($genos_orig{$chr_order_snp_orig[$i][$j][0]}[$k])){ # SPECIAL CLAUSE FOR MY SiMULATIONS $genos_orig{$chr_order_snp_orig[$i][$j][0]}[$k]=0; } } } } if($binsize==0){ # special case, don;t select any snps. for($i=0;$i<23;$i++){ $chr_snp_cnt[$i]=$chr_snp_cnt_orig[$i]; for($j=0;$j<$chr_snp_cnt_orig[$i];$j++){ for($k=0;$k<=7;$k++){ $chr_order_snp[$i][$j][$k]=$chr_order_snp_orig[$i][$j][$k]; } for($k=0;$k<=$#which;$k++){ $genos{$chr_order_snp[$i][$j][0]}[$k]=$genos_orig{$chr_order_snp[$i][$j][0]}[$k]; } } } return(); } for($i=0;$i<23;$i++){ $avge_hets=0; if($mytest==1){ print "Chromosome ",$i+1,"\n"; } $snp_length=$binsize; $current_marker_start=0; $j=0; SECTION: while($j<($chr_snp_cnt_orig[$i]-1)){ # find the number of SNPs in the bin of length x cM if($chr_order_snp_orig[$i][$j][3]<$snp_length){ if($snpchoice==0){ push(@current_bin_markers,$j); #print "current_bin_markers=@current_bin_markers\n"; } else{ $bin_het_hash{$j}=$chr_order_snp_orig[$i][$j][5]; } $j+=1; if($j<($chr_snp_cnt_orig[$i]-1)){ next SECTION; # if last marker continue to below, else keep stacking markers on. } } # end of interval sized bin_size: assess which SNP to pick, either random or highest het. $snp_length+=$binsize; # option 1: random snp: if($snpchoice==0 && $mytest==1){ if($#current_bin_markers == -1){ print "No snp in interval ("; printf "%3.2f",$snp_length-$binsize; print ","; printf "%3.2f",$snp_length; print ")\n"; next SECTION; } $final=0; if($mindist>0){ for($k=0;$k<=$#current_bin_markers;$k++){ # this isn't quite right, we actually need to randomly go through all the snps in the interval until a reasonable one is chosen $rand_snp=rand($#current_bin_markers); # chooses a random number between 0 and no of snps in interval $rand_snp=int($rand_snp); # must make into integer #print "Current chosen snp is $rand_snp\n"; $newmappos=$chr_order_snp_orig[$i][$current_bin_markers[$k]][3]; if($#final_list_of_snps == -1){ $oldmappos=0; } else{ $oldmappos=$chr_order_snp_orig[$i][$final_list_of_snps[$#final_list_of_snps]][3]; } if(!defined($chr_order_snp_orig[$i][$current_bin_markers[$k]][3])){ print "chr_order_snp_orig[$i][$current_bin_markers[$#current_bin_markers]][3]=$chr_order_snp_orig[$i][$current_bin_markers[$#current_bin_markers]][3]\n"; exit(1); } $mydist=$newmappos-$oldmappos; if($mydist>$mindist){ $final=$rand_snp; last; } } } if($mytest==1){ print "Chosen snp is $final\n"; } if($#current_bin_markers==0){ if($mydist<$mindist){ print "Marker SNP $chr_order_snp_orig[$i][$current_bin_markers[0]][0] is the only available SNP but does not have enough distance with only "; printf "%1.2f",$mydist; print "cM.\n"; } } push(@final_list_of_snps,$final+$current_bin_markers[0]); # rescale to snp no in chromo push(@final_list_of_snps_names,$chr_order_snp_orig[$i][$final+$current_bin_markers[0]][0]); push(@final_list_of_hets,$chr_order_snp_orig[$i][$final+$current_bin_markers[0]][5]); # snp het.. if($mytest==1){ print "snpchoice is random:\n"; for($k=0;$k<=$#current_bin_markers;$k++){ print "$current_bin_markers[$k] "; } print "\n"; } @current_bin_markers=(); } # option 2: choose SNP with highest het (according to CEPH HAPMAP data) else{ if(scalar keys(%bin_het_hash) == 0){ print "No snp in interval (",$snp_length-$binsize,",",$snp_length,")\n"; next SECTION; } if($mytest==1){ print "snpchoice is highest het:\n"; while (($key,$value) = each (%bin_het_hash)){ printf "%1.2f",$key; print ", "; printf "%1.2f",$value; print "\n"; } } $current_marker_start=$j; sub binnumerically {$bin_het_hash{$b} <=> $bin_het_hash{$a}}; # b and a reversed: sort is in descending order @snporder_sorted=sort binnumerically keys %bin_het_hash; #print "highest het snp is $snporder_sorted[0], het = $bin_het_hash{$snporder_sorted[0]}\n"; # now making sure that the mrkers have a minimum distance apart, at least 0.2 cM $final=0; if($mindist>0){ for($k=0;$k<=$#snporder_sorted;$k++){ $newmappos=$chr_order_snp_orig[$i][$snporder_sorted[$k]][3]; if($#final_list_of_snps == -1){ $oldmappos=0; } else{ $oldmappos=$chr_order_snp_orig[$i][$final_list_of_snps[$#final_list_of_snps]][3]; } if(!defined($chr_order_snp_orig[$i][$snporder_sorted[$k]][3])){ print "chr_order_snp_orig[$i][$final_list_of_snps[$#final_list_of_snps]][3]=$chr_order_snp_orig[$i][$final_list_of_snps[$#final_list_of_snps]][3]\n"; exit(1); } $mydist=$newmappos-$oldmappos; if($mydist>$mindist){ $final=$k; last; } } } if($mytest==1){ print "Chosen snp is $final\n"; } if($#snporder_sorted==0){ if($mydist<$mindist){ print "Marker SNP $chr_order_snp_orig[$i][$snporder_sorted[0]][0] is the only available SNP but does not have enough distance with only "; printf "%1.2f",$mydist; print "cM.\n"; } } push(@final_list_of_snps,$snporder_sorted[$final]); # choose HIghest het snp push(@final_list_of_snps_names,$chr_order_snp_orig[$i][$snporder_sorted[$final]][0]); # snp name.. push(@final_list_of_hets,$chr_order_snp_orig[$i][$snporder_sorted[$final]][5]); # snp het.. } %bin_het_hash=(); } if($mytest==1){ print "Chromosome ",$i+1,":\n"; foreach $entry (@final_list_of_snps){ print "$entry\t"; for($k=0;$k<=7;$k++){ if(defined($chr_order_snp_orig[$i][$entry][$k])){ print "$chr_order_snp_orig[$i][$entry][$k]\t"; } else{ print "\nProblem with [$i][$entry][$k], total count on chromosome is $chr_snp_cnt_orig[$i]\n"; } } print "\n"; } } # now need to only keep the markers listed in final_list_of_snps and ditch the rest # first a check however.. print list of snps to use: print "no of Snps chosen from chromosome ",$i+1," is ",$#final_list_of_snps+1,"\n"; # writing out the new data: $chr_snp_cnt[$i]=$#final_list_of_snps+1; $sum_of_snps+=$chr_snp_cnt[$i]; $sum_of_snps_orig+=$chr_snp_cnt_orig[$i]; for($j=0;$j<=$#final_list_of_snps;$j++){ $avge_hets+=$final_list_of_hets[$j]; for($k=0;$k<=7;$k++){ $chr_order_snp[$i][$j][$k]=$chr_order_snp_orig[$i][$final_list_of_snps[$j]][$k]; } for($k=0;$k<=$#which;$k++){ $genos{$final_list_of_snps_names[$j]}[$k]=$genos_orig{$final_list_of_snps_names[$j]}[$k]; } } $avge_hets/=($#final_list_of_snps+1); print "Average heterozygosity of chosen snps = "; printf "%1.2f", $avge_hets; print "\n\n"; $gws_hets+=$avge_hets; @final_list_of_snps=(); @final_list_of_snps_names=(); #exit(1); } print "Genome wide heterozygosity average = "; printf "%1.2f",$gws_hets/23; print "\n"; print "Total number of SNPs chosen: $sum_of_snps, total number of original snps: $sum_of_snps_orig\n"; } sub print_genos_test{ my $i; my $k; my @testsnp=(); $testsnp[0]="SNP_A-8645531"; $testsnp[1]="SNP_A-1934648"; for($i=0;$i<=$#testsnp;$i++){ print "TEST SNP $testsnp[$i]\n"; print "Individuals\n"; print "82 83 30 31 32 33 29 80 81 84 85\n"; print "genos\n"; for($k=0;$k<=$#which;$k++){ print "$genos{$testsnp[$i]}[$k] "; } print "\n"; print "genos_orig\n"; for($k=0;$k<=$#which;$k++){ print "$genos_orig{$testsnp[$i]}[$k] "; } print "\n"; } } # ----------------OUTPUT functions ------------------------------------------ # merlin output functions for .pre, .freq, .dat and .map files (QTDT format) sub print_pre_merlin{ my $i; my $j; my $k; my $l; my $filename; my $geno; my $c; my $chrom; my $pedidcnt=0; for($c=0;$c<23;$c++){ $chrom=$c+1; $filename=$merlindir."/merlin_".$chrom."_".$pfile.".pre"; #print "writing merlin .pre file $filename...\n"; open(OUT,">$filename")|| die print "Can't open file $filename\n"; IND: for($i=0;$i<$pedno;$i++){ $pedidcnt=0; for($j=0;$j<=$#whichsamples;$j++){ # genotyped individual if($i==$j && $whichsamples[$i]!=0){ # Why do I need this line at all? #print "individual $ped[$i][1] in pedigree, corresponds to $pedidcnt\n"; #print "pedidcnt=$pedidcnt\n"; for($k=0;$k<5;$k++){ print OUT "$ped[$i][$k] "; } if(defined($brlmm)){ # calls are -1 = missing, 0 = 1/1, 1 = 1/2, 2= 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is snp no $k, $chr_order_snp[$c][$k][0], ind count in geno array is $pedidcnt\n"; # shouldn't it be $whichsamples[$j] instead of pedidcnt? $geno=$genos{$chr_order_snp[$c][$k][0]}[$pedidcnt]; if($geno== -1){ print OUT "0/\t0\t"; } if($geno==0){ print OUT "1/\t1\t"; } if($geno==1){ print OUT "1/\t2\t"; } if($geno==2){ print OUT "2/\t2\t"; } } } else{ # crllm calls are 0 for missing, 1= 1/1, 2 =1/2, 3 = 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #$geno=$genos{$chr_order_snp[$c][$k][0]}[$whichsamples[$i]-1]; $geno=$genos{$chr_order_snp[$c][$k][0]}[$pedidcnt]; if(!defined($geno)){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $k th snp, name: $chr_order_snp[$c][$k][0], ind count in geno array is $pedidcnt\n"; #print "Problem with genos{$chr_order_snp[$c][$k][0]}[$whichsamples[$j]-1]=$genos{$chr_order_snp[$c][$k][0]}[$whichsamples[$j]-1]\n"; #for($l=0;$l<=$#whichsamples;$l++){ # print "$genos{$chr_order_snp[$c][$k][0]}[$l] "; #} #print "\n"; exit(1); } if($geno== 0){ print OUT "0/\t0\t"; } if($geno== 1){ print OUT "1/\t1\t"; } if($geno== 2){ print OUT "1/\t2\t"; } if($geno== 3){ print OUT "2/\t2\t"; } if($geno== -1){ print "Problem with b/crlmm file. Assumed to be crlmm but appears to be brlm. Check, and use -brlmm option if necessary.\n."; exit(1); } } } print OUT "$ped[$i][5]\n"; # merlin wants diseases locus always last. next IND; } if($whichsamples[$j]!=0){ # statement A $pedidcnt+=1; } } # not genotyped, all missing.. for($k=0;$k<5;$k++){ print OUT "$ped[$i][$k]\t"; } for($k=0;$k<$chr_snp_cnt[$c];$k++){ print OUT "0/","\t0\t"; } print OUT "$ped[$i][5]\n"; # merlin wants diseases locus always last. } close(OUT); } } sub print_dat_merlin{ my $i; my $j; my $chrom; my $filename; for($i=0;$i<23;$i++){ $chrom=$i+1; $filename=$merlindir."/merlin_".$chrom."_".$pfile.".dat"; #print "writing merlin .dat file $filename...\n"; open(OUT,">$filename")|| die print "Can't open file $filename\n"; for($j=0;$j<$chr_snp_cnt[$i];$j++){ print OUT "M $chr_order_snp[$i][$j][0]\n"; } print OUT "A disease\n"; close(OUT); } } sub print_map_merlin{ my $i; my $j; my $chrom; my $filename; for($i=0;$i<23;$i++){ $chrom=$i+1; $filename=$merlindir."/merlin_".$chrom."_".$pfile.".map"; #print "writing merlin .map file $filename...\n"; open(OUT,">$filename") || die print "Can't open file $filename\n"; print OUT "CHROMOSOME\tMARKER\tPOSITION\n"; for($j=0;$j<$chr_snp_cnt[$i];$j++){ print OUT "$chr_order_snp[$i][$j][1]\t$chr_order_snp[$i][$j][0]\t"; printf OUT "%1.2f",$chr_order_snp[$i][$j][3]; print OUT "\n"; } close OUT; } } # prints out all merlin freq files, one per chromo. sub print_freq_merlin{ my $i; my $j; my $snpfreq; my $chrom; my $filename; for($i=0;$i<23;$i++){ $chrom=$i+1; $filename=$merlindir."/merlin_".$chrom."_".$pfile.".freq"; #print "writing merlin .freq file $filename...\n"; open(OUT,">$filename") || die print "Can't open file $filename\n"; for($j=0;$j<$chr_snp_cnt[$i];$j++){ print OUT "M $chr_order_snp[$i][$j][0]\n"; $snpfreq=$chr_order_snp[$i][$j][4]; if($snpfreq>0 && $snpfreq<1){ print OUT "F "; printf OUT "%1.2f",$snpfreq; print OUT " "; printf OUT "%1.2f",1-$snpfreq; print OUT "\n"; } else{ # zero freq in hapmap sample if($snpfreq==0){ print OUT "F 0.01 0.99\n"; } else{ print OUT "F 0.99 0.01\n"; } } } close(OUT); } } # making the 23 merlin chromosome run files. sub print_merlin_run_files{ my $i; my $filename; my $whichmerlin; $whichmerlin="merlin"; for($i=1;$i<=23;$i++){ $filename=$merlindir."/merlin_".$i."_".$pfile.".in"; open(OUT,">$filename") || die print "Can't open file $filename\n"; if($i==23){ $whichmerlin="minx"; } # all necessary input files. # make a special case for the X chromosome - not yet done. # getting all double recombinant errors first. print OUT "mv param.tbl temp.tbl\n"; # so parametric linkage analysis isn't run in the double recombinant step next print OUT "$whichmerlin -d merlin_",$i,"_",$pfile,".dat -p merlin_",$i,"_",$pfile,".pre -f merlin_",$i,"_",$pfile,".freq -m merlin_",$i,"_",$pfile,".map"; # analysis options wanted print OUT " --error\n"; print OUT "cp merlin.err merlin_",$i,"_",$pfile,".err\n"; # running pedwipe to get rid of double recombinant errors print OUT "pedwipe -d merlin_",$i,"_",$pfile,".dat -p merlin_",$i,"_",$pfile,".pre -f merlin_",$i,"_",$pfile,".freq -m merlin_",$i,"_",$pfile,".map\n"; print OUT "cp wiped.dat merlin_",$i,"_",$pfile,".dat\n"; print OUT "cp wiped.ped merlin_",$i,"_",$pfile,".pre\n"; print OUT "mv temp.tbl param.tbl\n"; print OUT "$whichmerlin -d merlin_",$i,"_",$pfile,".dat -p merlin_",$i,"_",$pfile,".pre -f merlin_",$i,"_",$pfile,".freq -m merlin_",$i,"_",$pfile,".map"; # analysis options wanted #print OUT " --ibd --extended --founders --best --error --model param.tbl --pdf > merlin_",$i,"_",$pfile,".out"; print OUT " --smallswap --megabytes:9999 --founders --grid 0.3 --ibd --extended --best --exp --npl --model param.tbl --pdf > merlin_",$i,"_",$pfile,".out\n"; print OUT "cp merlin.ibd merlin_",$i,"_",$pfile,".ibd\n"; print OUT "cp merlin.chr merlin_",$i,"_",$pfile,".chr\n"; print OUT "cp merlin.pdf merlin_",$i,"_",$pfile,".pdf\n"; print OUT "cp merlin.flow merlin_",$i,"_",$pfile,".flow\n"; print OUT "cp merlin.hap merlin_",$i,"_",$pfile,".hap\n"; print OUT "cp merlin.s15 merlin_",$i,"_",$pfile,".s15\n"; close(OUT); } } # Allegro output files: # pre file for allegro, matched LINKAGE format sub print_pre_allegro{ my $i; my $j; my $k; my $filename; my $geno; my $c; my $chrom; my $pedidcnt=0; for($c=0;$c<23;$c++){ $chrom=$c+1; $filename=$allegrodir."/allegro_".$chrom."_".$pfile.".pre"; #print "writing allegro .pre file $filename...\n"; open(OUT,">$filename"); IND: for($i=0;$i<$pedno;$i++){ $pedidcnt=0; for($j=0;$j<=$#whichsamples;$j++){ # genotyped individual if($i==$j && $whichsamples[$i]!=0){ #print "individual $ped[$i][1] in pedigree, corresponds to $whichsamples[$j]\n"; #print "pedidcnt=$pedidcnt\n"; for($k=0;$k<=5;$k++){ print OUT "$ped[$i][$k] "; } if(defined($brlmm)){ # calls are -1 = missing, 0 = 1/1, 1 = 1/2, 2= 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $chr_order_snp[$c][$k][0], ind count in geno array is $pedidcnt\n"; $geno=$genos{$chr_order_snp[$c][$k][0]}[$pedidcnt]; if($geno== -1){ print OUT "0 0\t"; } if($geno==0){ print OUT "1 1\t"; } if($geno==1){ print OUT "1 2\t"; } if($geno==2){ print OUT "2 2\t"; } if($geno==3){ print "Problem with BRLMM calls. Found impossible 3 calls. Check that the -brlmm option is off.\n"; exit(1); } } } else{ # crllm calls are 0 for missing, 1= 1/1, 2 =1/2, 3 = 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $chr_order_snp[$c][$k][0], ind count in geno array is $pedidcnt\n"; $geno=$genos{$chr_order_snp[$c][$k][0]}[$pedidcnt]; if($geno== 0){ print OUT "0 0\t"; } if($geno== 1){ print OUT "1 1\t"; } if($geno== 2){ print OUT "1 2\t"; } if($geno== 3){ print OUT "2 2\t"; } if($geno== -1){ print "Problem with CRLMM calls. Found impossible -1 calls. Check that the -brlmm option is ON if you have BRLMM calls.\n"; exit(1); } } } print OUT "\n"; next IND; } if($whichsamples[$j]!=0){ # statement A $pedidcnt+=1; } } # not genotyped, all missing.. for($k=0;$k<=5;$k++){ print OUT "$ped[$i][$k] "; } for($k=0;$k<$chr_snp_cnt[$c];$k++){ print OUT "0 0 "; } print OUT "\n"; } close(OUT); } } # allegro style .dat files which are the LINKAGE style .dat files. sub print_dat_allegro{ my $j; my $c; my $chrom; my $filename; my $snpfreq; my $dist; for($c=0;$c<23;$c++){ $chrom=$c+1; $filename=$allegrodir."/allegro_".$chrom."_".$pfile.".dat"; #print "writing allegro .dat file $filename...\n"; open(OUT,">$filename"); print OUT $chr_snp_cnt[$c]+1," 0 0 5\n"; print OUT "0 0.0 0.0 0\n"; for($j=1;$j<=$chr_snp_cnt[$c];$j++){ print OUT "$j "; } print OUT "\n"; print OUT "1 2\n"; # disease alle freqs print OUT "0.9999 0.0001\n"; # CHANGED BY USER AS NECESSARY print OUT "1\n"; print OUT "0.0001 0.9999 0.9999\n";# CHANGED BY USER AS NECESSARY # now all the marker data: for ($j=0;$j<$chr_snp_cnt[$c];$j++){ print OUT "3 2 # $chr_order_snp[$c][$j][0]\n"; # contains marker name as well $snpfreq=$chr_order_snp[$c][$j][4]; if($snpfreq>0 && $snpfreq<1){ printf OUT "%1.2f",$snpfreq; print OUT " "; printf OUT "%1.2f",1-$snpfreq; print OUT "\n"; } else{ # zero freq in hapmap sample if($snpfreq==0){ print OUT "0.01 0.99\n"; } else{ print OUT "0.99 0.01\n"; } } } # now the map stuff print OUT "0 0\n"; for($j=1;$j<$chr_snp_cnt[$c];$j++){ $dist=$chr_order_snp[$c][$j][3]-$chr_order_snp[$c][$j-1][3]; printf OUT "%1.2f",$dist; print OUT " "; } print OUT "\n"; print OUT "1 0.10000 0.450000\n"; close(OUT); } } # making the 23 allegro chromosome run files. # unlike the merlin files above these still have to be run with the command allegro allegro_x.in in a separate # shell file sub print_allegro_run_files{ my $i; my $haplofile; my $ihaplofile; my $founderfile; my $inherfile; my $paramfile; my $uninffile; my $filename; $filename=$allegrodir."/runallegrofiles_".$pfile.".sh"; open(OUT1,">$filename"); for($i=1;$i<=23;$i++){ $filename=$allegrodir."/allegro_".$i."_".$pfile.".in"; open(OUT,">$filename"); print OUT1 "allegro allegro_".$i."_".$pfile.".in\n"; # all necessary input files. # make a special case for the X chromosome print OUT "PREFILE allegro_".$i."_".$pfile.".pre\n"; print OUT "DATFILE allegro_".$i."_".$pfile.".dat\n\n"; $paramfile="param_".$i.".out"; if($i<23){ print OUT "MODEL mpt par $paramfile\n"; # can stick in model params here if desired } else{ print OUT "MODEL mpt par X $paramfile\n"; # can stick in model params here if desired } $haplofile="haplo_".$i."_".$pfile.".out"; $ihaplofile="ihaplo_".$i."_".$pfile.".out"; $founderfile="founder_".$i."_".$pfile.".out"; $inherfile="inher_".$i."_".$pfile.".out"; print OUT "HAPLOTYPE $haplofile $ihaplofile $founderfile $inherfile\n"; $uninffile="uninf_".$i."_".$pfile.".out"; print OUT "UNINFORMATIVE $uninffile\n"; print OUT "PAIRWISEIBD mpt genotyped\n"; # generates two files: prior.mpt and posterior.mpt. close(OUT); } close(OUT1); } # PREST mcpeek & co, to check for sample swaps/pedigree errrors # NOte that the pedigree needs to have all integer pedids and indids. I'm assuming for the moment that this is ok. More easily fixed ad hoc outside of linkdatagen for the moment. # idx file is very similar to .dat file but has less in it. Needs recomb fractions, not cM distances. sub print_PREST_idx{ my $j; my $c; my $chrom; my $filename; my $snpfreq; my $dist; my $newdist; my $first; my $second; for($c=0;$c<23;$c++){ $chrom=$c+1; $filename=$prestdir."/prest_".$chrom."_".$pfile.".idx"; print "writing PREST .idx file $filename...\n"; open(OUT,">$filename"); print OUT "$chr_snp_cnt[$c]\n"; # now all the marker data: for ($j=0;$j<$chr_snp_cnt[$c];$j++){ print OUT "3 2 # $chr_order_snp[$c][$j][0]\n"; # contains marker name as well $snpfreq=$chr_order_snp[$c][$j][4]; $first=sprintf("%1.2f",$snpfreq); $second=1-$first; if($snpfreq>0 && $snpfreq<1){ printf OUT "$first $second\n"; } else{ # zero freq in hapmap sample if($snpfreq==0){ print OUT "0.01 0.99\n"; } else{ print OUT "0.99 0.01\n"; } } } # now the map stuff for($j=1;$j<$chr_snp_cnt[$c];$j++){ $dist=$chr_order_snp[$c][$j][3]-$chr_order_snp[$c][$j-1][3]; # change from cM to r via Haldane (lazy!) map function $newdist=(1-exp(-2*$dist/100))/2; printf OUT "%1.2f",$newdist; print OUT " "; } close(OUT); } } # output files for MORGAN (Thomson et al) sub print_MORGAN{ my $c; my $filename; my $mfilename; my $pfilename; my $chrom; my $i; my $snpfreq; my $first; my $second; my $j; my $k; my $geno; my $cnt=0; my $lmauto; # print out the parameter file $filename=$morgandir."/runmorgan.sh"; open(OUT3,">$filename"); for($c=0;$c<23;$c++){ $chrom=$c+1; print OUT3 "lm_markers morgan_".$chrom."_".$pfile.".par\n"; $filename=$morgandir."/morgan_".$chrom."_".$pfile.".par"; $mfilename=$morgandir."/morgan_".$chrom."_".$pfile.".mar"; print "writing MORGAN .mar file $filename...\n"; print "writing MORGAN .par file $mfilename...\n"; $pfilename=$morgandir."/morgan_".$chrom."_".$pfile.".ped"; print "writing MORGAN .ped file $pfilename...\n"; open(OUT1,">$filename"); print OUT1 "input pedigree file 'morgan_".$chrom."_".$pfile.".ped'\n"; print OUT1 "input pedigree size $pedno\n"; print OUT1 "input pedigree record names 3 integers 2\n"; # two integers to describe a) gender (assumed to be present) and b) affectedness = a binary trait print OUT1 "input marker data file 'morgan_".$chrom."_".$pfile.".mar'\n"; # specifies marker data file name # for lm_markers or lm_bayes #print OUT1 "map trait 1 all interval proportions 0.5\n"; # for lm_auto $lmauto=1; if($lmauto==1){ print OUT1 "map trait "; for($i=0;$i<$chr_snp_cnt[$c];$i++){ print OUT1 "1 "; } print OUT1 "marker "; for($i=1;$i<=$chr_snp_cnt[$c];$i++){ print OUT1 "$i "; } print OUT1 "distances "; for($i=0;$i<$chr_snp_cnt[$c];$i++){ printf OUT1 "%1.1f", $chr_order_snp[$c][$i][3]; print OUT1 " "; } print OUT1 "\n"; } print OUT1 "set trait data discrete\n"; print OUT1 "set incomplete penetrances 0.0001 0.0001 0.9999\n"; print OUT1 "select all markers traits 1\n"; print OUT1 "set trait 1 freqs 0.999 0.001\n"; # Monte Carlo setup and requests print OUT1 "sample by scan\n"; print OUT1 "set L-sampler probability 0.2\n"; ## For real analyses, recommended number of iterations is of order 10^4 ## at each simulation position print OUT1 "set MC iterations 10000\n"; print OUT1 "set burn-in iterations 300\n"; close(OUT1); # print out the pedigree file - no genotyping data allowed in this, and no pedigree qualifier in front of person's id open(OUT,">$pfilename"); print OUT "****************\n"; for($i=0;$i<$pedno;$i++){ for($k=1;$k<=5;$k++){ print OUT "$ped[$i][$k] "; } print OUT "\n"; } # print out the marker file containing the genotype data close(OUT); open(OUT2,">$mfilename"); # map data first #map marker positions 0.94 3.29 3.56 4.26 4.59 5.04 5.70 6.35 6.59 7.47 7.94 8.40 9.12 11.47 12.04 12.87 13.10 13.73 14.16 14.73 16.74 17.26 18.00 18.86 19.18 20.23 21.30 21.88 22.04 24.03 24.52 25.22 25.58 26.30 26.72 27.07 27.51 28.22 28.96 29.69 31.02 31.61 33.18 33.77 34.09 34.94 37.36 38.00 38.50 38.90 39.46 39.56 40.47 40.98 41.42 41.66 42.39 42.82 43.06 43.56 44.55 45.27 45.52 46.23 46.51 47.12 47.83 48.41 48.64 49.18 50.47 51.08 51.78 52.54 53.43 53.90 54.13 54.90 55.22 55.69 56.02 57.23 57.62 58.10 58.55 59.32 60.32 60.91 61.23 61.91 62.49 62.79 63.12 63.94 64.11 64.51 65.31 65.80 66.26 66.69 67.27 67.88 68.08 68.70 69.32 69.80 70.43 70.62 71.42 71.57 72.07 72.66 73.37 73.51 74.74 75.37 75.77 76.77 77.09 77.85 78.04 78.62 79.45 79.69 80.07 80.68 81.30 81.96 82.49 82.58 83.12 83.83 84.38 84.53 85.49 85.71 86.43 86.99 87.12 87.70 88.36 88.70 89.29 89.53 90.05 90.51 91.29 91.77 92.04 92.57 93.28 93.66 94.25 94.87 95.14 95.85 96.09 96.95 97.35 97.78 98.44 98.58 99.36 99.55 100.10 100.78 101.46 101.83 102.34 102.77 103.09 103.59 104.36 104.88 105.29 105.83 106.04 106.85 107.10 107.56 108.21 108.90 109.30 109.79 110.09 110.74 111.36 111.80 112.50 112.63 113.19 113.88 114.36 114.98 115.47 115.68 116.09 116.91 117.43 117.89 118.10 118.98 119.05 119.50 120.47 120.58 121.24 121.73 122.38 122.74 123.30 123.56 124.31 124.97 125.26 125.63 126.34 126.72 127.12 127.68 128.19 128.67 129.37 129.61 130.32 130.65 131.09 131.87 132.31 132.96 133.48 133.93 134.00 134.62 135.31 135.68 136.90 137.07 137.63 138.20 138.98 139.15 139.74 140.05 140.74 141.02 141.94 142.06 143.10 143.74 144.03 144.70 145.24 145.94 146.41 146.68 147.31 147.82 148.85 149.86 150.45 150.95 151.40 151.68 152.07 152.65 153.30 153.51 154.47 154.67 155.72 156.49 156.70 157.28 157.99 158.24 158.95 159.28 159.81 160.01 160.67 161.03 161.60 162.92 163.04 163.61 164.36 164.83 165.18 165.72 166.21 166.68 167.16 167.58 168.96 169.02 169.56 170.40 170.52 171.09 171.77 172.08 172.60 173.01 173.63 174.02 174.98 175.47 175.99 176.26 176.74 177.16 177.76 178.31 178.70 179.20 179.61 180.06 180.97 181.40 181.74 182.20 182.81 183.02 183.90 184.42 184.65 185.11 185.70 186.19 186.86 187.16 187.54 188.12 188.91 189.20 189.55 190.31 190.77 191.16 191.84 192.17 192.57 193.45 193.89 194.37 194.94 195.31 195.66 196.29 196.64 197.14 197.62 198.45 198.69 199.38 199.97 200.14 201.17 201.68 202.16 202.88 203.24 203.69 204.36 204.81 205.38 205.75 206.34 206.68 207.04 208.42 208.71 209.09 209.75 210.20 210.89 211.22 211.79 212.04 212.91 213.36 213.51 214.32 214.92 215.07 215.96 216.02 216.70 217.18 217.99 218.20 218.60 219.22 219.58 220.12 220.81 221.18 221.92 222.35 222.54 223.12 223.84 224.35 224.95 225.32 225.57 226.35 226.99 227.10 227.95 228.13 228.88 229.36 229.53 230.43 230.53 231.25 231.68 232.25 232.81 233.31 233.51 234.49 234.99 235.38 235.67 237.48 237.86 238.27 238.98 239.35 239.98 240.08 240.79 241.48 242.35 242.84 244.13 244.86 245.11 245.66 246.45 246.97 247.01 247.86 248.32 248.65 249.42 249.92 250.24 250.91 251.02 251.65 252.04 252.54 253.42 253.82 254.35 254.81 255.41 255.63 256.02 256.54 257.46 257.65 258.06 259.96 260.13 260.73 261.14 261.58 262.48 262.81 263.91 264.46 267.65 269.37 269.86 270.04 270.67 271.74 272.91 273.68 274.31 275.24 275.64 275.69 print OUT2 "map marker positions "; for($i=0;$i<$chr_snp_cnt[$c];$i++){ printf OUT2 "%1.2f", $chr_order_snp[$c][$i][3]; print OUT2 " "; } print OUT2 "\n\n"; # marker frequency data #set markers 1 freqs 0.85 0.15 for ($j=0;$j<$chr_snp_cnt[$c];$j++){ print OUT2 "set markers ",$j+1," freq "; $snpfreq=$chr_order_snp[$c][$j][4]; $first=sprintf("%1.2f",$snpfreq); $second=1-$first; if($snpfreq>0 && $snpfreq<1){ printf OUT2 "$first $second\n"; } else{ # zero freq in hapmap sample if($snpfreq==0){ print OUT2 "0.01 0.99\n"; } else{ print OUT2 "0.99 0.01\n"; } } } print OUT2 "\n\n"; # genotyping data # format is indid m1geno m2geno etc print OUT2 "set markers $chr_snp_cnt[$c] data \n"; $cnt=0; for($i=0;$i<$pedno;$i++){ if($whichsamples[$i]!=0){ print OUT2 "$ped[$i][1] "; if(defined($brlmm)){ # calls are -1 = missing, 0 = 1/1, 1 = 1/2, 2= 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $chr_order_snp[$c][$k][0], ind count in geno array is $pedidcnt\n"; $geno=$genos{$chr_order_snp[$c][$k][0]}[$cnt]; if($geno== -1){ print OUT2 "0 0\t"; } if($geno==0){ print OUT2 "1 1\t"; } if($geno==1){ print OUT2 "1 2\t"; } if($geno==2){ print OUT2 "2 2\t"; } } } else{ # crllm or birdseed calls are 0 for missing, 1= 1/1, 2 =1/2, 3 = 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $chr_order_snp[$c][$k][0], ind count in geno array is $i\n"; $geno=$genos{$chr_order_snp[$c][$k][0]}[$cnt]; if($geno== 0){ print OUT2 "0 0\t"; } if($geno== 1){ print OUT2 "1 1\t"; } if($geno== 2){ print OUT2 "1 2\t"; } if($geno== 3){ print OUT2 "2 2\t"; } #exit(1); } } $cnt+=1; print OUT2 "\n"; } } close(OUT2); } close(OUT3); } # output files for PLINK sub print_PLINK{ my $c; my $chrom; my $i; my $j; my $k; my $geno; my $temp; my $cnt=0; my $filename; $filename=$plinkdir."/plink.map"; open(OUT2,">$filename"); # print out the parameter file for($c=0;$c<23;$c++){ $chrom=$c+1; for($i=0;$i<$chr_snp_cnt[$c];$i++){ print OUT2 "$chrom "; print OUT2 "$chr_order_snp[$c][$i][0] "; printf OUT2 "%1.2f", $chr_order_snp[$c][$i][3]; print OUT2 " "; $temp=1000000*$chr_order_snp[$c][$i][3]; printf OUT2 "%1.0f", $temp; print OUT2 " "; print OUT2 "\n"; } } close(OUT2); $filename=$plinkdir."/plink.ped"; open(OUT2,">$filename"); for($i=0;$i<$pedno;$i++){ if($whichsamples[$i]!=0){ print OUT2 "1 "; print OUT2 "$ped[$i][1] "; print OUT2 "0 0 "; # pretend that they are all founders... for($k=4;$k<=5;$k++){ print OUT2 "$ped[$i][$k] "; } for($c=0;$c<23;$c++){ if(defined($brlmm)){ # calls are -1 = missing, 0 = 1/1, 1 = 1/2, 2= 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $chr_order_snp[$c][$k][0], ind count in geno array is $pedidcnt\n"; $geno=$genos{$chr_order_snp[$c][$k][0]}[$cnt]; if($geno== -1){ print OUT2 "0 0 "; } if($geno==0){ print OUT2 "1 1 "; } if($geno==1){ print OUT2 "1 2 "; } if($geno==2){ print OUT2 "2 2 "; } } } else{ # crllm or birdseed calls are 0 for missing, 1= 1/1, 2 =1/2, 3 = 2/2 for($k=0;$k<$chr_snp_cnt[$c];$k++){ #print "Current chromo is ",$c+1,". Snp count is $chr_snp_cnt[$c]\n"; #print "SNP is $chr_order_snp[$c][$k][0], ind count in geno array is $i\n"; $geno=$genos{$chr_order_snp[$c][$k][0]}[$cnt]; if($geno== 0){ print OUT2 "0 0 "; } if($geno== 1){ print OUT2 "1 1 "; } if($geno== 2){ print OUT2 "1 2 "; } if($geno== 3){ print OUT2 "2 2 "; } #exit(1); } } } $cnt+=1; print OUT2 "\n"; } } close(OUT2); close(OUT); } # output files for personal format, only prints out .pre style files but a) transposed from PLINK and b) per chromosome # affecteds are clumped together in the final columns, separated by a tab insteaf of a whitespace from the unaffecteds. sub print_MB{ my $c; my $chrom; my $i; my $j; my $k; my $geno; my $temp; my $cnt=0; my $filename; for($c=0;$c<23;$c++){ $chrom=$c+1; $filename=$mbdir."/mb_".$chrom.".ped"; open(OUT2,">$filename"); print OUT2 "Chr\tSNP\trsname\tgenmap\tphysmap\t"; for($i=0;$i<$pedno;$i++){ if($whichsamples[$i]!=0 && $ped[$i][5]!=2){ # print out unaffecteds first, then affecteds print OUT2 "$ped[$i][1]\t"; } } for($i=0;$i<$pedno;$i++){ if($whichsamples[$i]!=0 && $ped[$i][5]==2){ # print out unaffecteds first, then affecteds print OUT2 "$ped[$i][1]\t"; } } print OUT2 "\n"; for($k=0;$k<$chr_snp_cnt[$c];$k++){ print OUT2 "$chrom\t$chr_order_snp[$c][$k][0]\t"; # chr and snp name (affy style) print OUT2 "$chr_order_snp[$c][$k][6]\t"; #rs name printf OUT2 "%1.2f", $chr_order_snp[$c][$k][3]; # chr cM pos print OUT2 "\t"; print OUT2 $chr_order_snp[$c][$k][7]; # chr bp pos print OUT2 "\t"; $cnt=0; for($i=0;$i<$pedno;$i++){ if($whichsamples[$i]!=0){ if($ped[$i][5]!=2){ # print out unaffecteds first, then affecteds print OUT2 "$genos{$chr_order_snp[$c][$k][0]}[$cnt]\t"; } $cnt+=1; } } print OUT2 "\t\t"; $cnt=0; for($i=0;$i<$pedno;$i++){ if($whichsamples[$i]!=0){ if($ped[$i][5]==2){ # print out unaffecteds first, then affecteds print OUT2 "$genos{$chr_order_snp[$c][$k][0]}[$cnt]\t"; } $cnt+=1; } } print OUT2 "\n"; } close(OUT2); } }