linux 中 shell脚本实现提取gff文件中的最长转录本

发布时间 2023-07-02 22:33:51作者: 小鲨鱼2018

 

001、数据和脚本

[root@PC1 test2]# ls
GCF_001704415.1_ARS1_genomic.gff  record.sh

 

002、脚本

[root@PC1 test2]# cat record.sh        ## 脚本内容
#!/bin/bas

## step1: eliminate the influence of pseudogene
grep -v "^#" GCF_001704415.1_ARS1_genomic.gff | awk -F "\t" 'BEGIN{tag = "yes"}{if($3 == "pseudogene") {tag = "no"}; if($3 == "gene") {tag = "yes"}; if(tag == "yes") {print $0}}' > result.gff

## step2: exclude the genes without exon
awk -F "\t" 'BEGIN{sum = 0}{if($3 == "gene"){sum++} else {sum = 0}; if(sum > 1) {print NR - 1}}' result.gff | while read i
do
        sed -i "$i s/^/detele_tag\t/" result.gff
done
sed -i '/detele_tag\t/d' result.gff

## step3:
num=0
awk -F "\t" '$3 == "gene"{print $NF}' result.gff | cut -d ";" -f 1 | while read i
do                                                                ## 提取基因ID
        let num=$num+1
        check1=$(grep "$i;" result.gff | wc -l)
        if [ $check1 -ne 1 ]
        then
                echo "$i layer1 check"
                exit
        fi
        grep "$i;" result.gff >> 001.gff              ## 基因ID 写入文件
        str1=$(grep "$i;" result.gff | awk -F "\t" '{print $NF}' | cut -d ";" -f 1 | cut -d "=" -f 2)
        check2=$(grep "Parent=$str1;" result.gff | wc -l)
        echo "$i: $check2" >> 002.txt ## 记录每个基因的转录本数目
        if [ $check2 -lt 1 ]
        then
                echo "$i layer 2 check"
                exit
        fi
        grep "Parent=$str1;" result.gff > tmp
        for j in $(seq $check2)
        do                                                ## 提取每一个转录本的ID
                str2=$(sed -n "$j"p tmp | awk -F "\t" '{print $NF}' | cut -d ";" -f 1 | cut -d "=" -f 2)
                check3=$(grep "Parent=$str2;" result.gff | awk -F "\t" '$3 == "exon"' | wc -l)
                if [ $check3 -eq 0 ]
                then
                        echo "$i layer3 check"
                        exit
                fi
                sed -n "$j"p tmp >> 001.gff             ## 转录本ID写入文件
                grep "Parent=$str2;" result.gff | awk -F "\t" '$3 == "exon"' >> 001.gff  ## 外显子写入文件
                grep "Parent=$str2;" result.gff | awk -F "\t" '$3 == "exon"' | awk -v a=$i -v b=$str2 'BEGIN{sum = 0} {OFS = "\t"; sum += ($5 - $4 + 1)} END {print a, b, sum}' >> 003.txt
        done                                        ## 统计每一个转录本的长度并写入文件
        rm -f tmp
        echo $num done
done