snp基因型的转换、与甲基化位点表达量meQTL分析、snp_target对绘图

2020-04-02 14:00:13 阅读：265 来源： 互联网

标签：target format 甲基化 snp output meQTL my array

meQTL.pl

# 导入 -> 系统 package
$|=1;   
use warnings;
use strict;
use Encode qw/decode/;
use File::Spec;
use Getopt::Long;
use Statistics::R;
use Excel::Writer::XLSX;


# 定义 -> 常量
use constant SCRIPTDIR => (File::Spec->splitpath(File::Spec->rel2abs($0)))[1];
use constant PWD => $ENV{"PWD"};
my $scriptdir = SCRIPTDIR;
my $matrix_meQTL_r = "$scriptdir/matrix_meQTL.r";
my $boxplot_r = "$scriptdir/plot_boxplot.r";
my $Rscript = "/home/*/software/r/3.5.1/bin/Rscript";

# 检测 -> 脚本输入
my ($snp_genotype, $rs_ref_alt, $methyl_expression, $number, $output_path, $only, $skip, $list, $if_help);
GetOptions(
    "only=s"           => \$only,
    "skip=s"           => \$skip,
    "snp_genotype|g=s" => \$snp_genotype,
    "rs_ref_alt|r=s"   => \$rs_ref_alt,
    "methyl_expression|m=s" => \$methyl_expression,    
    "number|n=s"       => \$number,
    "output_path|o=s"  => \$output_path,
    "help|h"           => \$if_help,
    "list!"            => \$list
);

$number = 10 if (not defined $number); #默认绘制10个snp_target对；

 
if ($list) {
my $help =<<EOF;

1 snp基因型转换
2 meQTL计算
3 绘图
EOF
print qq{$help\n};
exit;    
}

die help() if(defined $if_help or (not defined $snp_genotype or not defined $rs_ref_alt or not defined $output_path));

my @run_steps = ();
if ($only) {
    foreach my $x (split /,/, $only) {
        push @run_steps, $x;
    }
} else {
    my @skips = split /,/, $skip if defined ($skip);
    foreach my $x (1..3) {
        next if $x ~~ @skips;
        push @run_steps, $x;
    }
}

####检查$snp_genotype(基因分型文件)和$methyl_expression(甲基化位点表达量)两个文件的样本名称和顺序是否一致
check_samples($snp_genotype, $methyl_expression);

#################################################主程序##############################################

####################################################################
####step 1:snp_genotype基因型转换
####################################################################
my $tmpdir = "$output_path/tmp"; 
mkdir $tmpdir if not -d $tmpdir;
my $snp_type = "$output_path/tmp/snp_type.txt";
if (1 ~~ @run_steps) {
    get_snp_type($snp_genotype, $rs_ref_alt, $snp_type);

    #####基因型转换前、转换后、突变位点 ==》xlsx表格输出
    my $report_snp_genotype = "$output_path/snp_genotype.xlsx";
    my $workbook   = Excel::Writer::XLSX->new($report_snp_genotype);
    my %format = format_excel($workbook); #获取Excel格式内容 
    ####sheet1:基因型转换前
    my $txt1       = "$snp_genotype";
    my $worksheet1 = $workbook->add_worksheet("Original_SNP_Result");
    report_xlsx($txt1, $worksheet1, \%format);
    ####sheet2:基因型转换后
    my $txt2       = "$snp_type";
    my $worksheet2 = $workbook->add_worksheet("Transformed_SNP_Result");
    report_xlsx($txt2, $worksheet2, \%format);
    ####sheet3:突变位点
    my $txt3       = "$rs_ref_alt";
    my $worksheet3 = $workbook->add_worksheet("Gene_Mutation_Site");
    report_xlsx($txt3, $worksheet3, \%format);
    $workbook->close();

}

####################################################################
####step 2:meQTL_cis and meQTL_trans 结果 ==》xlsx表格输出
####################################################################
if (2 ~~ @run_steps) {
    system qq{$Rscript $matrix_meQTL_r -o $tmpdir/meQTL_output.txt  -m $methyl_expression  -s $tmpdir/snp_type.txt};
    system qq{sed -i "s/gene/target/" $tmpdir/meQTL_output.txt};
    system qq{sed -i "s/-/_/" $tmpdir/meQTL_output.txt};
    my $report_meQTL = "$output_path/meQTL_output.xlsx";
    my $workbook_meQTL = Excel::Writer::XLSX->new($report_meQTL);
    my %format_meQTL = format_excel($workbook_meQTL);

    ####sheet1:meQTL_output
    my $txt_meQTL = "$tmpdir/meQTL_output.txt";
    my $worksheet_meQTL = $workbook_meQTL->add_worksheet("meQTL_Output");
    report_xlsx($txt_meQTL, $worksheet_meQTL, \%format_meQTL);
    ####sheet2:readme
    my $txt_readme = "$scriptdir/readme.txt";
    ReadMe($workbook_meQTL,\%format_meQTL,"$txt_readme");
    $workbook_meQTL->close();
}

####################################################################
####step 3:对p值最小的前n个结果绘制boxplot图，横坐标是snp分型，纵坐标是表型
####################################################################
if (3 ~~ @run_steps) {
    my $meQTL_output = "$tmpdir/meQTL_output.txt";
    my $count = $number + 1;
    my $extract_meQTL_output = "$tmpdir/extract_meQTL_output.txt";
    system qq{head -n $count $meQTL_output >$extract_meQTL_output};
    #画图
    system qq{$Rscript $boxplot_r -g $snp_genotype -m $methyl_expression -i $extract_meQTL_output -o $output_path/boxplot.pdf};
}

##################################################子程序##############################
sub check_samples{
    my $snp_genotype = shift @_;
    my $methyl_expression = shift @_;
    
    open FILE, $snp_genotype ;
    my $head = <FILE>;
       $head =~ s/[\r\n]//g;
    my @array0 = split /\t/, $head, 2;
    my @array = split /\t/, $array0[1];

    open FILE2, $methyl_expression;
    my $head2 = <FILE2>;
       $head2 =~ s/[\r\n]//g;
    my @array1 = split /\t/, $head2, 2;
    my @array2 = split /\t/, $array1[1];
    
    die "\n[Error]: please check '$snp_genotype $methyl_expression' samples name and order！\n\n" if (not @array ~~ @array2);
    
    close FILE;
    close FILE2;
}

sub get_snp_type{
    my $snp_genotype = shift @_;
    my $rs_ref_alt = shift @_;
    my $snp_type = shift @_;
    
    open RS, $rs_ref_alt or die "The rs_ref_alt file doesn't exist!\n";
    my %hashRef;
    my %hashAlt;
    while (<RS>) {
        $_ =~ s/[\r\n]//g;
        next if not /^rs/;
        my @array = split /\t/;
        $hashRef{$array[0]} = $array[1]; #$hash{"rs"} = "ref";
        $hashAlt{$array[0]}{$array[1]} = $array[2]; #$hashAlt{"rs"}{"ref"} = "alt";
    }
    
    open GENOTYPE, $snp_genotype or die "The snp_genotype file doesn't exist!\n";
    open SAVE, ">$snp_type";
    my $head = <GENOTYPE>;
       $head =~ s/[\r\n]//g;
    print SAVE "$head\n";
    
    while (<GENOTYPE>) {
        $_ =~ s/[\r\n]//g;
        my @array = split /\t/;
        my $ref = $hashRef{$array[0]};
        my $alt = $hashAlt{$array[0]}{$ref};
        foreach my $col (1..$#array){
            $array[$col] = exists $array[$col] ? $array[$col] : "";            
            $array[$col] = 0 if ($array[$col] eq "$ref/$ref");
            $array[$col] = 1 if ($array[$col] eq "$ref/$alt" or ($array[$col] eq "$alt/$ref"));
            $array[$col] = 2 if ($array[$col] eq "$alt/$alt");
        }
        # 输出
        print SAVE (join "\t", @array) . "\n";    
    }    
    close RS;
    close GENOTYPE;
    close SAVE;     
}

sub report_xlsx{
    my $txt       = shift;
    my $worksheet = shift;
    my $format      = shift;

    my $row = 0;
    open TXT, $txt or die "Can't open $txt!\n";
    while (<TXT>) {
        chomp;
        my @arr = split /\t/;
        if($row == 0){
            $worksheet->write_row( 0, 0, \@arr, $format->{"title"});
        }else{
            $worksheet->write_row( $row, 0, \@arr, $format->{"normal"});
        }
        $row++;
    }
    close TXT;
}

sub format_excel{
    my $workbook = shift;
    my %format = ();
    #第一行（标题设置:黄色、上下左右居中）
    $format{"title"} = $workbook->add_format();
    $format{"title"}->set_bg_color('yellow');
    $format{"title"}->set_align('center');
    $format{"title"}->set_align('vcenter');
    $format{"title"}->set_border();
    $format{"title"}->set_font("Times New Roman");
    #其它行（上下左右居中）
    $format{"normal"} = $workbook->add_format();
    $format{"normal"}->set_align('center');
    $format{"normal"}->set_align('vcenter');
    $format{"normal"}->set_border();
    $format{"normal"}->set_font("Times New Roman");

    return %format;
}

sub ReadMe{
    my ($workbook, $format, $file)=@_;
    my $sheet= $workbook -> add_worksheet("Read Me");
    $sheet->set_row(0, 50);
    $sheet->set_column('A:A', 35);
    $sheet->set_column('B:B', 25);
    $sheet->set_column('C:C', 110);
    my $row = 0;
    open FILE, $file;
    while (<FILE>){
        $_ = ~ s/\^/\n/g;
        my @split_line = split /\t/,$_;
        my $col = 0;
        foreach (@split_line)
        {
            my $text = decode("utf8", $_);
            if($row == 0 and $col >= 0){
                $sheet -> write($row, $col, $text, $format->{"title"});
            }else{
               $sheet -> write($row, $col, $text, $format->{"normal"});
            }
            $col++;
        }
        $row++;
    }
    close FILE;
}


sub help{
    my $info = " Program: meQTL_methylTarget's SNP type transportation, meQTL analysis and plot
 Version: 2020-02-28
 Contact: *** Usage: perl ".(File::Spec->splitpath(File::Spec->rel2abs($0)))[2]." [options] 


 Options:
         --snp_genotype/-g      基因分型文件,第一行为表头，分别为snpid编号，样本名1，样本名2...每一行表示一个snp位点，每一列表示一个样本的基因分型。注：snp_genotype与methyl_expression文件中样本顺序需保持一致。
         --rs_ref_alt/-r        SNP突变位点信息, 第一行为表头，三列内容：分别为snpid编号,ref,alt
         --methyl_expression/-m 甲基化位点的表达量，第一行为表头，分别为甲基化位点，样本名1，样本名1，样本名2...每一行表示一个甲基化位点，每一列表示一个样本的甲基化值。注：snp_genotype与methyl_expression文件中样本顺序需保持一致。
         --number/-n            绘制snp_target_boxplot关系对个数
         --output_path/-o       输出文件夹
         --help/-h              查看帮助文档
         --only                 only steps
         --skip                 skip steps
         --list                 分析列表
    \n";
    return $info;
}

matrix_meQTL.r

#!/home/genesky/software/r/3.5.1/bin/Rscript
library(docopt)
"Usage: MatrixEQTL_CIS.r -o <file>  -m <file>  -s <file> [-c <file>]
Options:
    -o, --output_file <file>        the eQTL result
    -m , --methyl_file <file>        the input methyltarget expression file
    -s , --SNV_file <file>            the input SNV type file
    -c , --cov_file <file>             the covariates file [default: xlsx]" -> doc

opts            <- docopt(doc, version = 'Program : EQTL analysis based on methyltarget and SNV data\n')
methyl_input    <- opts$methyl_file
SNV_input        <- opts$SNV_file
output_file_name <-opts$output_file
cov_input         <- opts$cov_file

#########
.libPaths("/home/*/software/r/3.5.1/lib64/R/library/")
library(MatrixEQTL)

useModel = modelLINEAR

# Genotype file name
SNP_file_name <- SNV_input

# Gene expression file name
expression_file_name <- methyl_input

# Covariates file name
covariates_file_name <- cov_input

# Output file name
output_file_name <- output_file_name

# Set to numeric() for identity
errorCovariance = numeric()

cisDist = 1e6

## Load genotype data
snps = SlicedData$new()
snps$fileDelimiter = "\t"      # the TAB character
snps$fileOmitCharacters = "NA" # denote missing values;
snps$fileSkipRows = 1          # one row of column labels
snps$fileSkipColumns = 1       # one column of row labels
snps$fileSliceSize = 2000      # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)

## Load gene expression data
gene = SlicedData$new()
gene$fileDelimiter = "\t"      # the TAB character
gene$fileOmitCharacters = "NA" # denote missing values;
gene$fileSkipRows = 1          # one row of column labels
gene$fileSkipColumns = 1       # one column of row labels
gene$fileSliceSize = 2000      # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)

## Load covariates
cvrt = SlicedData$new()
if(file.exists(covariates_file_name)) {
    cvrt$fileDelimiter = "\t"      # the TAB character
    cvrt$fileOmitCharacters = "NA" # denote missing values;
    cvrt$fileSkipRows = 1          # one row of column labels
    cvrt$fileSkipColumns = 1       # one column of row labels
    if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name)
    }
}

## Run the analysis
#snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)
#genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)

me = Matrix_eQTL_main(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name,
    pvOutputThreshold = 1,
    useModel = useModel,
    errorCovariance = errorCovariance,
    verbose = TRUE,
    cisDist = cisDist,
    pvalue.hist = TRUE,
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)

readme.txt

标题     说明
SNV位点    snp位点
target    甲基化片段位点
beta    线性回归分析系数
t_stat    线性回归分析t值
p_value    线性回归分析p值（小于0.05，标注橘黄色）
FDR    线性回归分析修正后的p值

plot_boxplot.r

#!/home/genesky/software/r/3.5.1/bin/Rscript

.libPaths("/home/*/software/r/3.5.1/lib64/R/library/")
library(docopt)
"Usage: boxplot.r  -g <file> -m <file> -i <file>  -o <file> [--width <int> --y_name <string>]
Options:
   -g, --genotype <file>                   输入文件，基因分型文件，第一行为表头，分别为snpid编号，样本名1，样本名2...每一行表示一个snp位点，每一列表示一个样本的基因分型。注：snp_genotype与methyl_expression文件中样本顺序需保持一致。
   -m, --methyl_expression <file>          输入文件，甲基化位点的表达量，第一行为表头，分别为甲基化位点，样本名1，样本名1，样本名2...每一行表示一个甲基化位点，每一列表示一个样本的甲基化值。注：snp_genotype与methyl_expression文件中样本顺序需保持一致。
   -i, --meQTL_result <file>               输入文件，meQTL_output数据
   -o, --outdir <file>                     输出路径
   --y_name <string>                       y轴名称 [default: methyl_expression]
   --width <int>                           pdf绘图时，每一个分组的宽度 [default: 2.5]" -> doc

opts                     <- docopt(doc, version = '点图绘制\n')
genotype                 <- opts$genotype
methyl_expression        <- opts$methyl_expression
meQTL_result             <- opts$meQTL_result
outdir                   <- opts$outdir
y_name                   <- opts$y_name
width                    <- opts$width

# 加载R包
library(ggpubr, quietly = TRUE)

#读入snp分型数据
message("loading data : ", genotype)
data_snp_genotype <- read.table(genotype, sep='\t', header = F, check.names=F, stringsAsFactors=F)
data_snp_genotype <- t(data_snp_genotype) #转置
data_snp_genotype <- data_snp_genotype[,-1] #删除第一列样本名称
colnames(data_snp_genotype) <- data_snp_genotype[1,] #第一行是标题行


#读入甲基化位点表达量数据
message("loading data : ", methyl_expression)
data_methyl_expression <- read.table(methyl_expression, sep='\t', header = F, check.names=F, stringsAsFactors=F)
data_methyl_expression <- t(data_methyl_expression) #转置
data_methyl_expression <- data_methyl_expression[,-1] #删除第一列样本名称
colnames(data_methyl_expression) <- data_methyl_expression[1,] #第一行是标题行


#读入meQTL_output数据
message("loading data : ", meQTL_result)
data_meQTL_result <- read.table(meQTL_result, sep='\t', header = T, check.names=F, stringsAsFactors=F)
compare <- data_meQTL_result[,c(1,2,5)] #提取snp_target对、p_value


pdf(outdir, width = 3 * as.numeric(width), height = 7)
for(row in 1:nrow(compare))
{   
    snp = compare[row, 1]
    target = compare[row, 2]
    p_value = compare[row, 3]
    p_value <- format(p_value, scientific=TRUE, digit=3)

    data_plot <- cbind(data_snp_genotype[,snp], data_methyl_expression[,target]) #提取绘图数据
    data_plot <- as.data.frame(data_plot) 
    data_plot <- data_plot[-1,] #删除标题行，否则非数值的标题行在as.numeric()时报错“强制改变过程中产生了NA”
    data_plot[,2] <- as.character(data_plot[,2]) #factor -> character
    data_plot[,2] <- as.numeric(data_plot[,2])   #character ->numeric
    colnames(data_plot) = c(snp, target)
    rownames(data_plot) <- NULL  
    data_plot <- data_plot[order(data_plot[,1],decreasing=T),] #根据genotype排序
    data_plot = data_plot[complete.cases(data_plot), ]  # 去掉缺失
    
    legend_P <- paste("p_value","=",p_value,sep = "")
    plot_title = colnames(data_plot)[2]

    colnames(data_plot)[2] = 'GENE'  # 变量不能以数字开头，故替换名称

    p <- ggboxplot(data_plot, x=snp, y='GENE', color = snp, 
         palette = "npg", #杂志nature的配色
         # palette = "aaas", #杂志Science的配色
         # palette = "jco", #按jco杂志配色方案
         add = c("jitter"),  # 增加点
         add.params = list(color = snp),
         outlier.size = -1, # 隐藏离群点，否则离群点会在图上出现两次（boxplot绘制离群点，jitter再绘制所有点，从而使离群点绘制两次）
         )+ xlab(snp) + ylab(y_name) +  guides(fill = FALSE, color = FALSE) + theme(plot.title = element_text(hjust = 0.5)) + labs(title = plot_title, subtitle = legend_P)  # 去掉legend,标题居中
        print(p)    
}

dev.off()

标签：target,format,甲基化,snp,output,meQTL,my,array
来源： https://www.cnblogs.com/dongxj1994/p/12619375.html

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

snp基因型的转换、与甲基化位点表达量meQTL分析、snp_target对绘图