LINUXexit;

如果想知道数据库里模式的具体位置,我们必须使用特殊变量`$&',这个变量在对正则表达式求值后仍然保存着找到的模式(应该将它放在`if ($secuencia_total =~ /$patron/)'一句的后面)。另外,可以将变量`$`'和`$''组合起来使用,它们会将找到的模式的左右位置的信息保存。将这些变量正确的加入前面的程序中,我们就可以给出模式的确切位置。注意:length也是非常有用的,它会给出一串数据的长度。

# for the given pattern
# and check its position in the sequence
if ($secuencia_total =~ /$patron/) {
    $posicion = length($`) + 1;
    print "The sequence query_seq.txt contains the pattern $patron in the following position $posicion\n";
} else {
    print "The sequence query_seq.txt doesn't contains the pattern $patron\n";
}

计算氨基酸的频度(Calculus of aminoacid frequencies):不同蛋白质里,特定的氨基酸出现的频度是不同的,这是因为他们处在不同的环境里面、并且功能不同。下面,我们给出一个例子来展示如何计算给定氨基酸序列里某种氨基酸频度。

#!/usr/bin/perl
# Calculates the frequency of aminoacid in a proteinic sequence
# Gets the file name from the command line
# (SWISS-PROT formatted)
# Also can be asked with print from the <STDIN>
if (!$ARGV[0]) { print "The execution line shall be: program.pl file_swissprot\n"; }
$fichero = $ARGV[0];

# Initialize the variable $errores
my $errores = 0;

# Open the file for reading
open(FICHA, "$fichero") || die "problem opening the file $fichero\n";

# First we check the sequence as did in the example 2
while (<FICHA>) {
    chomp $_;
    if ($_ =~ /^SQ/) {
        $signal_good = 1;
    } elsif ($signal_good == 1) {
        last if ($_ =~ /^\/\//);
        $_ =~ s/\s//g;
        $secuencia .= $_;
    }
}
close(FICHA);

# Now use a curl that checks every position of the aminoacid
# in the sequence (from a function of its own, that can be used after in other
# programs)
comprueba_aa($secuencia);

# Print the results to the screen
# First the 20 aminoacids and then the array with their frequencies
# In this case 'sort' can't be used in foreach,
# because the array contains the frequencies (numbers)
print "A\tC\tD\tE\tF\tG\tH\tI\tK\tL\tM\tN\tP\tQ\tR\tS\tT\tV\tW\tY\n";
foreach $each_aa (@aa) {
    print "$each_aa\t";
}

# Then it gives the possible errors
# and ends the program
print "\nerrores=$errores\n";
exit;

# Functions
# This one calculates each aminoacid frequency
# from a proteinic sequence
sub comprueba_aa {
    # Gets the sequence
    my ($secuencia) = @_;
    # and runs aminoacid by aminoacid, using a for running
    # from 0 until the sequence length
    for ($posicion = 0; $posicion < length $secuencia; $posicion++) {
        # Gets the aminoacid
        $aa = substr($secuencia, $posicion, 1);
        # and checks which one is using if
        # when it is checked it aggregates 1 to the correspondant frequency
        # in an array using a pointer for each one
        # ordered in alphabetic way
        if    ($aa eq 'A') { $aa[0]++; }
        elsif ($aa eq 'C') { $aa[1]++; }
        elsif ($aa eq 'D') { $aa[2]++; }
        elsif ($aa eq 'E') { $aa[3]++; }
        elsif ($aa eq 'F') { $aa[4]++; }
        elsif ($aa eq 'G') { $aa[5]++; }
        elsif ($aa eq 'H') { $aa[6]++; }
        elsif ($aa eq 'I') { $aa[7]++; }
        elsif ($aa eq 'K') { $aa[8]++; }
        elsif ($aa eq 'L') { $aa[9]++; }
        elsif ($aa eq 'M') { $aa[10]++; }
        elsif ($aa eq 'N') { $aa[11]++; }
        elsif ($aa eq 'P') { $aa[12]++; }
        elsif ($aa eq 'Q') { $aa[13]++; }
        elsif ($aa eq 'R') { $aa[14]++; }
        elsif ($aa eq 'S') { $aa[15]++; }
        elsif ($aa eq 'T') { $aa[16]++; }
        elsif ($aa eq 'V') { $aa[17]++; }
        elsif ($aa eq 'W') { $aa[18]++; }
        elsif ($aa eq 'Y') { $aa[19]++;
        # If the aminoacid is not found
        # it aggregates 1 to the errors
        } else {
            print "ERROR: Aminoacid not found: $aa\n";
            $errores++;
        }
    }
    # Finally returns to the frequency array
    return @aa;
}

下面就让我们跟着大自然的步伐,看看细胞中的信息流向了何方。其中之一就是转录,RNA从DNA(基因)中复制出遗传信息,然后又将这些信息传递给蛋白质或者氨基酸序列。为此,我们必须使用与氨基酸对应的基因密码--所谓的RNA/DNA三联密码子。我们要提取Escherichia coli(一种埃[舍利]希氏杆菌属的大肠杆菌)的基因所对应的氨基酸序列,而这些信息都是以EMBL(European Molecular Biology Laboratory)要求的格式。做完这些转换之后,我们将与已有的转录信息校验。对这个例子,非常有必要引进数组的关联变量(associative variables of arrays)和哈希表。
#!/usr/bin/perl
# Translates an ADN sequence from an EMBL fiche
# to the aminoacid correspondant
# Gets the file name from the command line
# (SWISS-PROT formatted)
# Also can be asked with print from the <STDIN>
if (!$ARGV[0]) { print "The program line shall be: program.pl ficha_embl\n"; }
$fichero = $ARGV[0];

# Open the file for reading
open(FICHA, "$fichero") || die "problem opening the file $fichero\n";

# First we check the sequence as did in the example 2
while (<FICHA>) {
    chomp $_;
    if ($_ =~ /^FT   CDS/) {
        $_ =~ tr/../ /;
        ($a1, $a2, $a3, $a4) = split(" ", $_);
    } elsif ($_ =~ /^SQ/) {
        $signal_good = 1;
    } elsif ($signal_good == 1) {
        last if ($_ =~ /^\/\//);
        # Eliminate numbers and spaces
        $_ =~ tr/0-9/ /;
        $_ =~ s/\s//g;
        $secuencia .= $_;
    }
}
close(FICHA);

# Now we define an associate array with the correspondence
# of every aminoacids with their nucleotide
# correspondants (also in an own function,
# for if the same genetic code is used in other program
my (%codigo_genetico) = (
    'TCA' => 'S',    # Serine
    'TCC' => 'S',    # Serine
    'TCG' => 'S',    # Serine
    'TCT' => 'S',    # Serine
    'TTC' => 'F',    # Fenilalanine
    'TTT' => 'F',    # Fenilalanine
    'TTA' => 'L',    # Leucine
    'TTG' => 'L',    # Leucine
    'TAC' => 'Y',    # Tirosine
    'TAT' => 'Y',    # Tirosine
    'TAA' => '*',    # Stop
    'TAG' => '*',    # Stop
    'TGC' => 'C',    # Cysteine
    'TGT' => 'C',    # Cysteine
    'TGA' => '*',    # Stop
    'TGG' => 'W',    # Tryptofane
    'CTA' => 'L',    # Leucine
    'CTC' => 'L',    # Leucine
    'CTG' => 'L',    # Leucine
    'CTT' => 'L',    # Leucine
    'CCA' => 'P',    # Proline
    'CCC' => 'P',    # Proline
    'CCG' => 'P',    # Proline
    'CCT' => 'P',    # Proline
    'CAC' => 'H',    # Hystidine
    'CAT' => 'H',    # Hystidine
    'CAA' => 'Q',    # Glutamine
    'CAG' => 'Q',    # Glutamine
    'CGA' => 'R',    # Arginine
    'CGC' => 'R',    # Arginine
    'CGG' => 'R',    # Arginine
    'CGT' => 'R',    # Arginine
    'ATA' => 'I',    # IsoLeucine
    'ATC' => 'I',    # IsoLeucine
    'ATT' => 'I',    # IsoLeucine
    'ATG' => 'M',    # Methionina
    'ACA' => 'T',    # Treonina
    'ACC' => 'T',    # Treonina
    'ACG' => 'T',    # Treonina
    'ACT' => 'T',    # Treonina
    'AAC' => 'N',    # Asparagina
    'AAT' => 'N',    # Asparagina
    'AAA' => 'K',    # Lisina
    'AAG' => 'K',    # Lisina
    'AGC' => 'S',    # Serine
    'AGT' => 'S',    # Serine
    'AGA' => 'R',    # Arginine
    'AGG' => 'R',    # Arginine
    'GTA' => 'V',    # Valine
    'GTC' => 'V',    # Valine
    'GTG' => 'V',    # Valine
    'GTT' => 'V',    # Valine
    'GCA' => 'A',    # Alanine
    'GCC' => 'A',    # Alanine
    'GCG' => 'A',    # Alanine
    'GCT' => 'A',    # Alanine
    'GAC' => 'D',    # Aspartic Acid
    'GAT' => 'D',    # Aspartic Acid
    'GAA' => 'E',    # Glutamic Acid
    'GAG' => 'E',    # Glutamic Acid
    'GGA' => 'G',    # Glicine
    'GGC' => 'G',    # Glicine
    'GGG' => 'G',    # Glicine
    'GGT' => 'G',    # Glicine
);

# Translate every codon in its correspondant aminoacid
# and aggregates to the proteinic sequence
print $a3;
for ($i = $a3 - 1; $i < $a4 - 3; $i += 3) {
    $codon = substr($secuencia, $i, 3);
    # Pass the codon from subcase (EMBL format) to uppercase
    $codon =~ tr/a-z/A-Z/;
    $protein .= codon2aa($codon);
}
print "This proteinic sequence of the gen:\n$secuencia\nis the following:\n$protein\n\n";
exit;

上一篇:ADODB与PearDB的兼容部分 下一篇:构建一个Perl/CGI投票系统 更多相关文章
|
推荐文章
精彩文章
|