#!/usr/bin/perl -w
##
## Support & Confidence calculation for TF-TG singTF 3 clusters.
## The script is set up for cluster 28; change c28 to c1 or c2 in the file
## names inside main() when necessary.
## Minlu Zhang, May 12 2009
##
## Inputs (all tab/line oriented text files in the current directory):
##   tg_clusters_*.txt     one gene per line, a line starting with @ begins
##                         the next gene cluster
##   tf_clusters_*.txt     one TF (matrix) per line, same @ convention
##   *_combined_info_mod.txt
##                         score matrix: a "GENE<TAB>tf1<TAB>tf2..." header
##                         row followed by "gene<TAB>s1<TAB>s2..." rows;
##                         raw scores are halved before use
##   importance_*.txt      "tf<TAB>importance" per line
##
## Output (stdout): a banner, the gene/TF totals, one support value per
## (TF cluster, gene cluster) pair, then one confidence row per
## (gene row, TF column) pair of the score matrix.
##
use strict;
use warnings;

## Run as a script; stay silent when the file is loaded with require/do.
main() unless caller;

## Open $path for reading.
## Returns the filehandle, or undef (after a warning) when the file cannot
## be opened; the parsers below treat an undef handle as an empty file, so
## a missing input stays non-fatal.
sub open_input {
    my ($path) = @_;
    my $fh;
    unless (open($fh, '<', $path)) {
        warn "$path file not opened: $!\n";
        return;
    }
    return $fh;
}

## Return $value, or 0 when it is undefined.
sub value_or_zero {
    my ($value) = @_;
    return defined $value ? $value : 0;
}

## Read "tf<TAB>importance" lines from $fh.
## Returns a hash reference: TF name => importance score (kept as read).
## Lines that do not match the two-column layout are ignored.
sub parse_importance {
    my ($fh) = @_;
    my %importance_of;
    return \%importance_of unless defined $fh;

    while (my $line = <$fh>) {
        if ($line =~ /^(\S+)\t(\S+)/) {
            $importance_of{$1} = $2;
        }
    }
    return \%importance_of;
}

## Read a cluster membership list from $fh.
## A line starting with @ advances to the next cluster index (indices start
## at 0); any other line starting with a non-blank token adds that token to
## the current cluster.
## Returns (\%cluster_of, \@size_of, $total):
##   %cluster_of  member name => cluster index (last occurrence wins)
##   @size_of     cluster index => number of member lines (undef if none)
##   $total       number of member lines over all clusters
sub parse_clusters {
    my ($fh) = @_;
    my (%cluster_of, @size_of);
    my $cluster = 0;
    my $total   = 0;
    return (\%cluster_of, \@size_of, $total) unless defined $fh;

    while (my $line = <$fh>) {
        if ($line =~ /^\@/) {
            $cluster++;
        }
        elsif ($line =~ /^(\S+)/) {
            $cluster_of{$1} = $cluster;
            $size_of[$cluster]++;
            $total++;
        }
    }
    return (\%cluster_of, \@size_of, $total);
}

## Read the TF x gene score matrix from $fh.
##   $tf_cluster_of / $tg_cluster_of   name => cluster index hash references
## Every raw score is divided by 2 before it is stored.
## Returns (\@tfs, \@tgs, \%score_of, \%cluster_sum):
##   @tfs          TF names from the most recent GENE header row
##   @tgs          gene names, one entry per data row, in file order
##   %score_of     $score_of{$tf}{$tg} = halved score
##   %cluster_sum  $cluster_sum{$tf_cluster}{$tg_cluster} = sum of the halved
##                 scores of all pairs that fall into that cluster pair
## Pairs whose TF or gene has no cluster are reported and left out of
## %cluster_sum.
sub parse_matrix {
    my ($fh, $tf_cluster_of, $tg_cluster_of) = @_;
    my (@tfs, @tgs, %score_of, %cluster_sum);
    return (\@tfs, \@tgs, \%score_of, \%cluster_sum) unless defined $fh;

    while (my $line = <$fh>) {
        ## Strip LF and CRLF endings so the last column carries no "\r".
        $line =~ s/[\r\n]+\z//;

        if ($line =~ /^GENE\t(.*)/) {
            @tfs = split /\t/, $1;
        }
        elsif ($line =~ /^(\S+)\t(.*)/) {
            my ($tg, $rest) = ($1, $2);
            my @raw = split /\t/, $rest;
            push @tgs, $tg;

            for my $col (0 .. $#tfs) {
                my $tf    = $tfs[$col];
                my $value = $raw[$col];
                unless (defined $value && length $value) {
                    warn "Error: no score between $tf and $tg, using 0\n";
                    $value = 0;
                }

                my $score = $value / 2;    ## normalizing scores from 0 to 1
                if ($score > 1 || $score < 0) {
                    warn "Error: score between $tf and $tg is $score!\n";
                }
                $score_of{$tf}{$tg} = $score;

                my $m = $tf_cluster_of->{$tf};
                my $n = $tg_cluster_of->{$tg};
                warn "$tf not defined!\n" unless defined $m;
                warn "$tg not defined!\n" unless defined $n;
                if (defined $m && defined $n) {
                    $cluster_sum{$m}{$n} += $score;    ## for support
                }
            }
        }
    }
    return (\@tfs, \@tgs, \%score_of, \%cluster_sum);
}

## Support score (factor 3) for every (TF cluster, gene cluster) index pair:
## the cluster-pair score sum divided by the number of TF/gene pairs in it.
## A cluster pair with no members or no scores gets support 0 instead of
## triggering a division by zero.
## Returns a hash reference: $support->{$tf_cluster}{$tg_cluster}.
sub compute_support {
    my ($cluster_sum, $tf_size_of, $tg_size_of) = @_;
    my %support;

    for my $m (0 .. $#{$tf_size_of}) {
        for my $n (0 .. $#{$tg_size_of}) {
            my $pairs = value_or_zero($tf_size_of->[$m])
                      * value_or_zero($tg_size_of->[$n]);
            my $sum = value_or_zero($cluster_sum->{$m}{$n});
            $support{$m}{$n} = $pairs ? $sum / $pairs : 0;
        }
    }
    return \%support;
}

## Factors 1 and 2 of the confidence score.
##   factor 1: $f1->{$tf}{$tg_cluster}, the score between a TF and a gene
##             cluster (scores summed, each divided by the cluster size)
##   factor 2: $f2->{$tf_cluster}{$tg}, the score between a gene and a TF
##             cluster (same construction)
## A pair contributes only when BOTH its TF and its gene belong to a
## cluster; any other pair is reported and skipped.
## Returns (\%f1, \%f2).
sub compute_factors {
    my ($tfs, $tgs, $score_of, $tf_cluster_of, $tg_cluster_of,
        $tf_size_of, $tg_size_of) = @_;
    my (%f1, %f2);

    for my $tg (@{$tgs}) {
        for my $tf (@{$tfs}) {
            my $m = $tf_cluster_of->{$tf};
            my $n = $tg_cluster_of->{$tg};
            unless (defined $m && defined $n) {
                warn "$tf or $tg not defined!\n";
                next;
            }
            my $score = value_or_zero($score_of->{$tf}{$tg});
            ## A defined cluster index always has at least one member, so
            ## these divisors are never zero.
            $f1{$tf}{$n} += $score / $tg_size_of->[$n];
            $f2{$m}{$tg} += $score / $tf_size_of->[$m];
        }
    }
    return (\%f1, \%f2);
}

## Eighth root of $confidence (three nested square roots).
## Returns undef for a negative input, where sqrt() would abort the script.
sub normalize_confidence {
    my ($confidence) = @_;
    return if $confidence < 0;
    return sqrt(sqrt(sqrt($confidence)));
}

## Script entry point: read the four input files, then print the support
## table followed by the confidence table.
sub main {
    print "\n";
    print " ## Support & Confidence calculation\n";
    print " ## for TF-TG singTF 3 clusters \n";
    print " ## the script is for cluster 28, change c28 to c1 or c2 when necessary \n";
    print " ## Minlu Zhang, May 12 2009\n\n";

    my $tg_cluster_file = "./tg_clusters_c28.txt";
    my $tf_cluster_file = "./tf_clusters_c28.txt";
    my $matrix_file     = "./C28_combined_info_mod.txt";
    my $importance_file = "./importance_c28_all.txt";    ## preliminary

    my $fh = open_input($importance_file);
    my $importance_of = parse_importance($fh);
    close $fh if defined $fh;

    $fh = open_input($tg_cluster_file);
    my ($tg_cluster_of, $tg_size_of, $total_genes) = parse_clusters($fh);
    close $fh if defined $fh;
    print "total # of genes: $total_genes\n";

    $fh = open_input($tf_cluster_file);
    my ($tf_cluster_of, $tf_size_of, $total_factors) = parse_clusters($fh);
    close $fh if defined $fh;
    print "total # of factors: $total_factors\n";

    $fh = open_input($matrix_file);
    my ($tfs, $tgs, $score_of, $cluster_sum)
        = parse_matrix($fh, $tf_cluster_of, $tg_cluster_of);
    close $fh if defined $fh;

    my $support = compute_support($cluster_sum, $tf_size_of, $tg_size_of);
    my ($factor1, $factor2) = compute_factors(
        $tfs, $tgs, $score_of, $tf_cluster_of, $tg_cluster_of,
        $tf_size_of, $tg_size_of,
    );

    ## printing out support scores
    for my $m (0 .. $#{$tf_size_of}) {
        for my $n (0 .. $#{$tg_size_of}) {
            print join("\t", "Ct$m", "Cg$n", $support->{$m}{$n}), "\n";
        }
    }

    print join("\t",
        "Ti", "Gj", "Ct", "Cg", "Factor 1", "Factor 2",
        "Factor 3(Support)", "Factor 4", "Factor 5",
        "Confidence", "Confidence (normalized)"), "\n";

    ## calculating and printing out confidence scores
    my %importance_warned;
    for my $tg (@{$tgs}) {
        for my $tf (@{$tfs}) {
            my $m = $tf_cluster_of->{$tf};
            my $n = $tg_cluster_of->{$tg};
            my $clustered = defined $m && defined $n;

            ## Cluster-based factors are 0 for a pair without both clusters.
            my $f1 = $clustered ? value_or_zero($factor1->{$tf}{$n}) : 0;
            my $f2 = $clustered ? value_or_zero($factor2->{$m}{$tg}) : 0;
            my $f3 = $clustered ? value_or_zero($support->{$m}{$n})  : 0;

            ## Factor 4 rescales the pair score from 0-1 to 0.5-1.
            my $f4 = value_or_zero($score_of->{$tf}{$tg}) * 0.5 + 0.5;

            my $f5 = $importance_of->{$tf};
            unless (defined $f5) {
                warn "no importance score for $tf, using 0\n"
                    unless $importance_warned{$tf}++;
                $f5 = 0;
            }

            my $confy  = $f1 * $f2 * $f3 * $f4 * $f5;
            my $nconfy = normalize_confidence($confy);
            unless (defined $nconfy) {
                warn "negative confidence $confy for $tf and $tg\n";
                $nconfy = 'NA';
            }

            print join("\t",
                $tf, $tg,
                'Ct' . (defined $m ? $m : 'NA'),
                'Cg' . (defined $n ? $n : 'NA'),
                $f1, $f2, $f3, $f4, $f5, $confy, $nconfy), "\n";
        }
    }
    return;
}

1;