"""Convert U to T in a FASTA sequence file.

Description: This script is used to convert U in fasta sequence file to T
Date: 2019
Author: omicsgene

Only the command-line parsing and output-directory setup are shown here.
"""
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import sys, os, argparse, os.path, re, math, time

# Bio.Alphabet was removed in Biopython 1.78; tolerate its absence so the
# script still starts on current releases.
try:
    from Bio.Alphabet import IUPAC
except ImportError:
    IUPAC = None

# Command-line options
parser = argparse.ArgumentParser(
    description='This script is used to convert U in fasta sequence file to T')
parser.add_argument('-f', '--fasta', help='Please fasta file', required=True)
parser.add_argument('-o', '--out_dir',
                    help='Please input complete out_put directory path',
                    default=os.getcwd(), required=False)
parser.add_argument('-n', '--name', default='demo_seq', required=False,
                    help='Please specify the output, demo_seq')
################################################################################
# Read the arguments and initialise the output path
args = parser.parse_args()
# makedirs (unlike mkdir) also creates missing parent directories.
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir, exist_ok=True)
dout = os.path.abspath(args.out_dir)
# Demonstrate sorted(): it returns a new list and leaves its input untouched.
a = [5, 7, 6, 3, 4, 1, 2]
b = sorted(a)  # the original list is preserved
print(b)
b = sorted(a, reverse=True)  # descending order
print(b)

a = ['Fast', 'Smooth', 'fast', 'isb', 'isa', 'smooth']
b = sorted(a, key=str.lower, reverse=False)  # case-insensitive ordering
print(b)
sort 与 sorted 区别:
调用方式不同: sort 是应用在 list 上的方法,sorted 可以对所有可迭代的对象进行排序操作。
返回对象不同: list 的 sort 方法是在原来的列表上操作,无返回值(None),而函数 sorted 方法返回的是一个新的 list,而不是在原来的基础上进行的操作。
# Write "chrom<TAB>start<TAB>end<TAB>geneID" for every "gene" record on
# chromosome "1" whose start lies strictly between 100000 and 500000.
# `fr` (input) and `fw` (output) are file handles opened before this loop.
for line in fr:
    line = line.strip()
    # Blank lines used to raise IndexError on line[0]; skip them together
    # with comment lines.
    if not line or line.startswith("#"):
        continue
    tmp = line.split("\t")
    if len(tmp) < 9:
        continue  # not a complete 9-column record
    if tmp[0] == "1" and tmp[2] == "gene" and 100000 < int(tmp[3]) < 500000:
        # Take the value of the first "key=value" attribute as the gene ID.
        geneID = tmp[8].split(";")[0].split("=")[1]
        mystr = "\t".join([tmp[0], tmp[3], tmp[4], geneID]) + "\n"
        fw.write(mystr)
fr.close()
fw.close()
The whole figure. The figure keeps track of all the child Axes, a smattering of ‘special’ artists (titles, figure legends, etc.), and the canvas.
The top level container Artist is the matplotlib.figure.Figure
This is what you think of as ‘a plot’, it is the region of the image with the data space. A given figure can contain many Axes, but a given Axes object can only be in one Figure.
These are the number-line-like objects. They take care of setting the graph limits and generating the ticks (the marks on the axis) and ticklabels (strings labeling the ticks).
Basically everything you can see on the figure is an artist (even the Figure, Axes, and Axis objects). This includes Text objects, Line2D objects, collection objects, Patch objects … (you get the idea).
Everything in matplotlib is organized in a hierarchy. At the top of the hierarchy is the matplotlib “state-machine environment” which is provided by the matplotlib.pyplot module.
# Class definition
class people:
    """A person with a name, an age and a private weight."""

    # Class-level defaults
    name = ''
    age = 0
    # Double-underscore attribute: name-mangled, so it cannot be accessed
    # directly as obj.__weight from outside the class
    __weight = 0

    def __init__(self, n, a, w):
        """Store name ``n``, age ``a`` and weight ``w`` on the instance."""
        self.name = n
        self.age = a
        self.__weight = w

    def speak(self):
        """Print a one-line self introduction."""
        print("%s speak: I am %d years old." % (self.name, self.age))


# Single-inheritance example
class student(people):
    """A ``people`` that additionally records a school grade."""

    grade = ''

    def __init__(self, n, a, w, g):
        """Initialise the ``people`` part, then store grade ``g``."""
        # Delegate to the parent constructor
        super().__init__(n, a, w)
        self.grade = g

    def speak(self):
        """Override the parent method to also report the grade."""
        print("%s speak: I am %d years old, I am in %d grade of primary school."
              % (self.name, self.age, self.grade))


s = student('ken', 10, 60, 3)
s.speak()
def das_auto(make="Volkswagen", model="Magotan", year=2019):
    '''Create a Car; with no arguments this is a 2019 Volkswagen Magotan.

    The arguments are forwarded to ``Car`` (previously they were ignored
    and the default values were always used).
    '''
    my_car = Car(make, model, year)
    return my_car
# Class definition
class Car:
    '''this class define a car '''

    # Class attribute shared by every instance
    wheels = 4

    def __init__(self, make, model, year):
        '''Store make, model and year; the odometer starts at 0.'''
        self.make = make
        self.model = model
        self.year = year
        # (sic) attribute name kept as-is so existing callers keep working
        self.orometer_reading = 0

    def get_description(self):
        '''Return "<year> <make> <model> <wheels>" as a single string.'''
        long_name = " ".join(
            [str(self.year), self.make, self.model, str(self.wheels)])
        return long_name

    def read_odometer(self):
        '''Print the current odometer reading.'''
        print("This car has " + str(self.orometer_reading) + " miles on it")

    def update_orometer(self, miles):
        '''Set the odometer to ``miles``; lower values are refused.'''
        if miles >= self.orometer_reading:
            self.orometer_reading = miles
        else:
            print("You can't roll back an odometer")

    def increase(self, miles):
        '''Add ``miles`` to the odometer; negative amounts are refused.'''
        # A negative increment would roll the odometer back, which
        # update_orometer() already forbids.
        if miles < 0:
            print("You can't roll back an odometer")
            return
        self.orometer_reading += miles
import os

# Raw string: "\P" is not a valid escape sequence in a normal string literal.
os.chdir(r"D:\Python_script")
cwd = os.getcwd()
# Walk the tree and print the full path of every file and sub-directory.
for dir_path, dir_names, file_names in os.walk(cwd):
    for file_name in file_names:
        p = os.path.join(dir_path, file_name)
        print(p)
    for dir_name in dir_names:
        p = os.path.join(dir_path, dir_name)
        print(p)
# Write "chrom<TAB>start<TAB>end<TAB>strand<TAB>geneID" for every "gene"
# record. `fr`/`fw` are file handles opened before this loop and `re` is
# imported at the top of the script.
gene_id_pattern = re.compile(r"ID=gene:([^;]+)")  # captures the gene ID
for line in fr:
    if line.startswith("#"):
        continue
    # Drop the line terminator first: otherwise "[^;]+" captures the
    # trailing newline when the ID is the last attribute on the line.
    tmp = line.rstrip("\r\n").split("\t")
    if len(tmp) < 9:
        continue  # blank or truncated record
    if tmp[2] == "gene":
        mobj = gene_id_pattern.search(tmp[8])
        if mobj:
            fw.write("\t".join([tmp[0], tmp[3], tmp[4], tmp[6], mobj.group(1)]) + "\n")
fr.close()
fw.close()
5 Python biopython包处理生物数据
5.1 BioPython包
Blast output – both from standalone and WWW Blast
PubMed and Medline
ExPASy files, like Enzyme and Prosite
SCOP, including ‘dom’ and ‘lin’ files
NCBI – Blast, Entrez and PubMed services
ExPASy – Swiss-Prot and Prosite entries, as well as Prosite searches
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

# Collect the wanted IDs (one per line); only membership is needed, so a
# set is sufficient.
idlist = set()
with open("idlist.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line:  # ignore blank lines instead of storing an empty ID
            idlist.add(line)

# Copy every record whose ID is in the list; "with" closes the output even
# if parsing fails.
with open("get.fa", "w") as f_out:
    for rec in SeqIO.parse("test.fa", "fasta"):
        if rec.id in idlist:
            SeqIO.write(rec, f_out, "fasta")
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os
import re

os.chdir("D:\\python_script")

# Map each sequence ID to its [start, end] positions (tab-separated input:
# ID, start, end).
idlist = {}
with open("id_pos.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # a blank line would raise IndexError below
        tmp = re.split(r"\t", line)
        idlist[tmp[0]] = [int(tmp[1]), int(tmp[2])]

with open("get_pos.fa", "w") as f_out:
    for rec in SeqIO.parse("test.fa", "fasta"):
        if rec.id in idlist:
            start, end = idlist[rec.id]
            # Positions in the input are 1-based while Python slices are
            # 0-based, hence start - 1.
            rec_new = SeqRecord(rec.seq[start - 1:end], id=rec.id,
                                description=rec.description)
            SeqIO.write(rec_new, f_out, "fasta")
==任务3 答案:==
1 2 3 4 5 6 7 8 9 10 11
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

os.chdir("D:\\python_script")
# Convert every FASTQ record to FASTA. The handle is managed by "with"
# because it was never closed before, which can leave buffered output
# unwritten.
with open("fq2fa.fa", "w") as output_handle:
    for rec in SeqIO.parse("test.fq", "fastq"):
        SeqIO.write(rec, output_handle, "fasta")
==任务4 答案:==
1 2 3 4 5 6 7 8 9 10 11
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

os.chdir("D:\\python_script")
# Write each read without its first five positions; "with" closes the
# output even if parsing or writing raises.
with open("trimed.fq", "w") as output_handle:
    for rec in SeqIO.parse("test.fq", "fastq"):
        rec_new = rec[5:]
        SeqIO.write(rec_new, output_handle, "fastq")
import numpy as np
import os

os.chdir("D://python_script//")
# Load columns 0 and 1 of the comma-separated file; unpack=True returns one
# array per selected column.
a, b = np.loadtxt("iris.data", delimiter=',', usecols=[0, 1], unpack=True)
print(a)
print(b)
# Class definition
class people:
    """A person with a name, an age and a private weight."""

    # Class-level defaults
    name = ''
    age = 0
    # Double-underscore attribute: name-mangled, so it cannot be accessed
    # directly as obj.__weight from outside the class
    __weight = 0

    def __init__(self, n, a, w):
        """Store name ``n``, age ``a`` and weight ``w`` on the instance."""
        self.name = n
        self.age = a
        self.__weight = w

    def speak(self):
        """Print a one-line self introduction."""
        print("%s speak: I am %d years old." % (self.name, self.age))


# Single-inheritance example
class student(people):
    """A ``people`` that additionally records a school grade."""

    grade = ''

    def __init__(self, n, a, w, g):
        """Initialise the ``people`` part, then store grade ``g``."""
        # Delegate to the parent constructor
        super().__init__(n, a, w)
        self.grade = g

    def speak(self):
        """Override the parent method to also report the grade."""
        print("%s speak: I am %d years old, I am in %d grade of primary school."
              % (self.name, self.age, self.grade))


s = student('ken', 10, 60, 3)
s.speak()
def das_auto(make="Volkswagen", model="Magotan", year=2019):
    '''Create a Car; with no arguments this is a 2019 Volkswagen Magotan.

    The arguments are forwarded to ``Car`` (previously they were ignored
    and the default values were always used).
    '''
    my_car = Car(make, model, year)
    return my_car
# Class definition
class Car:
    '''this class define a car '''

    # Class attribute shared by every instance
    wheels = 4

    def __init__(self, make, model, year):
        '''Store make, model and year; the odometer starts at 0.'''
        self.make = make
        self.model = model
        self.year = year
        # (sic) attribute name kept as-is so existing callers keep working
        self.orometer_reading = 0

    def get_description(self):
        '''Return "<year> <make> <model> <wheels>" as a single string.'''
        long_name = " ".join(
            [str(self.year), self.make, self.model, str(self.wheels)])
        return long_name

    def read_odometer(self):
        '''Print the current odometer reading.'''
        print("This car has " + str(self.orometer_reading) + " miles on it")

    def update_orometer(self, miles):
        '''Set the odometer to ``miles``; lower values are refused.'''
        if miles >= self.orometer_reading:
            self.orometer_reading = miles
        else:
            print("You can't roll back an odometer")

    def increase(self, miles):
        '''Add ``miles`` to the odometer; negative amounts are refused.'''
        # A negative increment would roll the odometer back, which
        # update_orometer() already forbids.
        if miles < 0:
            print("You can't roll back an odometer")
            return
        self.orometer_reading += miles
import os

# Raw string: "\P" is not a valid escape sequence in a normal string literal.
os.chdir(r"D:\Python_script")
cwd = os.getcwd()
# Walk the tree and print the full path of every file and sub-directory.
for dir_path, dir_names, file_names in os.walk(cwd):
    for file_name in file_names:
        p = os.path.join(dir_path, file_name)
        print(p)
    for dir_name in dir_names:
        p = os.path.join(dir_path, dir_name)
        print(p)
# Write "chrom<TAB>start<TAB>end<TAB>strand<TAB>geneID" for every "gene"
# record. `fr`/`fw` are file handles opened before this loop and `re` is
# imported at the top of the script.
gene_id_pattern = re.compile(r"ID=gene:([^;]+)")  # captures the gene ID
for line in fr:
    if line.startswith("#"):
        continue
    # Drop the line terminator first: otherwise "[^;]+" captures the
    # trailing newline when the ID is the last attribute on the line.
    tmp = line.rstrip("\r\n").split("\t")
    if len(tmp) < 9:
        continue  # blank or truncated record
    if tmp[2] == "gene":
        mobj = gene_id_pattern.search(tmp[8])
        if mobj:
            fw.write("\t".join([tmp[0], tmp[3], tmp[4], tmp[6], mobj.group(1)]) + "\n")
fr.close()
fw.close()
5 Python biopython包处理生物数据
5.1 BioPython包
Blast output – both from standalone and WWW Blast
PubMed and Medline
ExPASy files, like Enzyme and Prosite
SCOP, including ‘dom’ and ‘lin’ files
NCBI – Blast, Entrez and PubMed services
ExPASy – Swiss-Prot and Prosite entries, as well as Prosite searches
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

# Collect the wanted IDs (one per line); only membership is needed, so a
# set is sufficient.
idlist = set()
with open("idlist.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line:  # ignore blank lines instead of storing an empty ID
            idlist.add(line)

# Copy every record whose ID is in the list; "with" closes the output even
# if parsing fails.
with open("get.fa", "w") as f_out:
    for rec in SeqIO.parse("test.fa", "fasta"):
        if rec.id in idlist:
            SeqIO.write(rec, f_out, "fasta")
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os
import re

os.chdir("D:\\python_script")

# Map each sequence ID to its [start, end] positions (tab-separated input:
# ID, start, end).
idlist = {}
with open("id_pos.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # a blank line would raise IndexError below
        tmp = re.split(r"\t", line)
        idlist[tmp[0]] = [int(tmp[1]), int(tmp[2])]

with open("get_pos.fa", "w") as f_out:
    for rec in SeqIO.parse("test.fa", "fasta"):
        if rec.id in idlist:
            start, end = idlist[rec.id]
            # Positions in the input are 1-based while Python slices are
            # 0-based, hence start - 1.
            rec_new = SeqRecord(rec.seq[start - 1:end], id=rec.id,
                                description=rec.description)
            SeqIO.write(rec_new, f_out, "fasta")
==任务3 答案:==
1 2 3 4 5 6 7 8 9 10 11
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

os.chdir("D:\\python_script")
# Convert every FASTQ record to FASTA. The handle is managed by "with"
# because it was never closed before, which can leave buffered output
# unwritten.
with open("fq2fa.fa", "w") as output_handle:
    for rec in SeqIO.parse("test.fq", "fastq"):
        SeqIO.write(rec, output_handle, "fasta")
==任务4 答案:==
1 2 3 4 5 6 7 8 9 10 11
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

os.chdir("D:\\python_script")
# Write each read without its first five positions; "with" closes the
# output even if parsing or writing raises.
with open("trimed.fq", "w") as output_handle:
    for rec in SeqIO.parse("test.fq", "fastq"):
        rec_new = rec[5:]
        SeqIO.write(rec_new, output_handle, "fastq")
import numpy as np
import os

os.chdir("D://python_script//")
# Load columns 0 and 1 of the comma-separated file; unpack=True returns one
# array per selected column.
a, b = np.loadtxt("iris.data", delimiter=',', usecols=[0, 1], unpack=True)
print(a)
print(b)
"""Convert U to T in a FASTA sequence file.

Description: This script is used to convert U in fasta sequence file to T
Date: 2019
Author: omicsgene

Only the command-line parsing and output-directory setup are shown here.
"""
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import sys, os, argparse, os.path, re, math, time

# Bio.Alphabet was removed in Biopython 1.78; tolerate its absence so the
# script still starts on current releases.
try:
    from Bio.Alphabet import IUPAC
except ImportError:
    IUPAC = None

# Command-line options
parser = argparse.ArgumentParser(
    description='This script is used to convert U in fasta sequence file to T')
parser.add_argument('-f', '--fasta', help='Please fasta file', required=True)
parser.add_argument('-o', '--out_dir',
                    help='Please input complete out_put directory path',
                    default=os.getcwd(), required=False)
parser.add_argument('-n', '--name', default='demo_seq', required=False,
                    help='Please specify the output, demo_seq')
################################################################################
# Read the arguments and initialise the output path
args = parser.parse_args()
# makedirs (unlike mkdir) also creates missing parent directories.
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir, exist_ok=True)
dout = os.path.abspath(args.out_dir)
# Demonstrate sorted(): it returns a new list and leaves its input untouched.
a = [5, 7, 6, 3, 4, 1, 2]
b = sorted(a)  # the original list is preserved
print(b)
b = sorted(a, reverse=True)  # descending order
print(b)

a = ['Fast', 'Smooth', 'fast', 'isb', 'isa', 'smooth']
b = sorted(a, key=str.lower, reverse=False)  # case-insensitive ordering
print(b)
sort 与 sorted 区别:
调用方式不同: sort 是应用在 list 上的方法,sorted 可以对所有可迭代的对象进行排序操作。
返回对象不同: list 的 sort 方法是在原来的列表上操作,无返回值(None),而函数 sorted 方法返回的是一个新的 list,而不是在原来的基础上进行的操作。
# Write "chrom<TAB>start<TAB>end<TAB>geneID" for every "gene" record on
# chromosome "1" whose start lies strictly between 100000 and 500000.
# `fr` (input) and `fw` (output) are file handles opened before this loop.
for line in fr:
    line = line.strip()
    # Blank lines used to raise IndexError on line[0]; skip them together
    # with comment lines.
    if not line or line.startswith("#"):
        continue
    tmp = line.split("\t")
    if len(tmp) < 9:
        continue  # not a complete 9-column record
    if tmp[0] == "1" and tmp[2] == "gene" and 100000 < int(tmp[3]) < 500000:
        # Take the value of the first "key=value" attribute as the gene ID.
        geneID = tmp[8].split(";")[0].split("=")[1]
        mystr = "\t".join([tmp[0], tmp[3], tmp[4], geneID]) + "\n"
        fw.write(mystr)
fr.close()
fw.close()
The whole figure. The figure keeps track of all the child Axes, a smattering of ‘special’ artists (titles, figure legends, etc.), and the canvas.
The top level container Artist is the matplotlib.figure.Figure
This is what you think of as ‘a plot’, it is the region of the image with the data space. A given figure can contain many Axes, but a given Axes object can only be in one Figure.
These are the number-line-like objects. They take care of setting the graph limits and generating the ticks (the marks on the axis) and ticklabels (strings labeling the ticks).
Basically everything you can see on the figure is an artist (even the Figure, Axes, and Axis objects). This includes Text objects, Line2D objects, collection objects, Patch objects … (you get the idea).
Everything in matplotlib is organized in a hierarchy. At the top of the hierarchy is the matplotlib “state-machine environment” which is provided by the matplotlib.pyplot module.
"""Convert U to T in a FASTA sequence file.

Description: This script is used to convert U in fasta sequence file to T
Date: 2019
Author: omicsgene

Only the command-line parsing and output-directory setup are shown here.
"""
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import sys, os, argparse, os.path, re, math, time

# Bio.Alphabet was removed in Biopython 1.78; tolerate its absence so the
# script still starts on current releases.
try:
    from Bio.Alphabet import IUPAC
except ImportError:
    IUPAC = None

# Command-line options
parser = argparse.ArgumentParser(
    description='This script is used to convert U in fasta sequence file to T')
parser.add_argument('-f', '--fasta', help='Please fasta file', required=True)
parser.add_argument('-o', '--out_dir',
                    help='Please input complete out_put directory path',
                    default=os.getcwd(), required=False)
parser.add_argument('-n', '--name', default='demo_seq', required=False,
                    help='Please specify the output, demo_seq')
################################################################################
# Read the arguments and initialise the output path
args = parser.parse_args()
# makedirs (unlike mkdir) also creates missing parent directories.
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir, exist_ok=True)
dout = os.path.abspath(args.out_dir)
# Demonstrate sorted(): it returns a new list and leaves its input untouched.
a = [5, 7, 6, 3, 4, 1, 2]
b = sorted(a)  # the original list is preserved
print(b)
b = sorted(a, reverse=True)  # descending order
print(b)

a = ['Fast', 'Smooth', 'fast', 'isb', 'isa', 'smooth']
b = sorted(a, key=str.lower, reverse=False)  # case-insensitive ordering
print(b)
sort 与 sorted 区别:
调用方式不同: sort 是应用在 list 上的方法,sorted 可以对所有可迭代的对象进行排序操作。
返回对象不同: list 的 sort 方法是在原来的列表上操作,无返回值(None),而函数 sorted 方法返回的是一个新的 list,而不是在原来的基础上进行的操作。
# Write "chrom<TAB>start<TAB>end<TAB>geneID" for every "gene" record on
# chromosome "1" whose start lies strictly between 100000 and 500000.
# `fr` (input) and `fw` (output) are file handles opened before this loop.
for line in fr:
    line = line.strip()
    # Blank lines used to raise IndexError on line[0]; skip them together
    # with comment lines.
    if not line or line.startswith("#"):
        continue
    tmp = line.split("\t")
    if len(tmp) < 9:
        continue  # not a complete 9-column record
    if tmp[0] == "1" and tmp[2] == "gene" and 100000 < int(tmp[3]) < 500000:
        # Take the value of the first "key=value" attribute as the gene ID.
        geneID = tmp[8].split(";")[0].split("=")[1]
        mystr = "\t".join([tmp[0], tmp[3], tmp[4], geneID]) + "\n"
        fw.write(mystr)
fr.close()
fw.close()
# Class definition
class people:
    """A person with a name, an age and a private weight."""

    # Class-level defaults
    name = ''
    age = 0
    # Double-underscore attribute: name-mangled, so it cannot be accessed
    # directly as obj.__weight from outside the class
    __weight = 0

    def __init__(self, n, a, w):
        """Store name ``n``, age ``a`` and weight ``w`` on the instance."""
        self.name = n
        self.age = a
        self.__weight = w

    def speak(self):
        """Print a one-line self introduction."""
        print("%s speak: I am %d years old." % (self.name, self.age))


# Single-inheritance example
class student(people):
    """A ``people`` that additionally records a school grade."""

    grade = ''

    def __init__(self, n, a, w, g):
        """Initialise the ``people`` part, then store grade ``g``."""
        # Delegate to the parent constructor
        super().__init__(n, a, w)
        self.grade = g

    def speak(self):
        """Override the parent method to also report the grade."""
        print("%s speak: I am %d years old, I am in %d grade of primary school."
              % (self.name, self.age, self.grade))


s = student('ken', 10, 60, 3)
s.speak()
def das_auto(make="Volkswagen", model="Magotan", year=2019):
    '''Create a Car; with no arguments this is a 2019 Volkswagen Magotan.

    The arguments are forwarded to ``Car`` (previously they were ignored
    and the default values were always used).
    '''
    my_car = Car(make, model, year)
    return my_car
# Class definition
class Car:
    '''this class define a car '''

    # Class attribute shared by every instance
    wheels = 4

    def __init__(self, make, model, year):
        '''Store make, model and year; the odometer starts at 0.'''
        self.make = make
        self.model = model
        self.year = year
        # (sic) attribute name kept as-is so existing callers keep working
        self.orometer_reading = 0

    def get_description(self):
        '''Return "<year> <make> <model> <wheels>" as a single string.'''
        long_name = " ".join(
            [str(self.year), self.make, self.model, str(self.wheels)])
        return long_name

    def read_odometer(self):
        '''Print the current odometer reading.'''
        print("This car has " + str(self.orometer_reading) + " miles on it")

    def update_orometer(self, miles):
        '''Set the odometer to ``miles``; lower values are refused.'''
        if miles >= self.orometer_reading:
            self.orometer_reading = miles
        else:
            print("You can't roll back an odometer")

    def increase(self, miles):
        '''Add ``miles`` to the odometer; negative amounts are refused.'''
        # A negative increment would roll the odometer back, which
        # update_orometer() already forbids.
        if miles < 0:
            print("You can't roll back an odometer")
            return
        self.orometer_reading += miles
import os

# Raw string: "\P" is not a valid escape sequence in a normal string literal.
os.chdir(r"D:\Python_script")
cwd = os.getcwd()
# Walk the tree and print the full path of every file and sub-directory.
for dir_path, dir_names, file_names in os.walk(cwd):
    for file_name in file_names:
        p = os.path.join(dir_path, file_name)
        print(p)
    for dir_name in dir_names:
        p = os.path.join(dir_path, dir_name)
        print(p)
# Write "chrom<TAB>start<TAB>end<TAB>strand<TAB>geneID" for every "gene"
# record. `fr`/`fw` are file handles opened before this loop and `re` is
# imported at the top of the script.
gene_id_pattern = re.compile(r"ID=gene:([^;]+)")  # captures the gene ID
for line in fr:
    if line.startswith("#"):
        continue
    # Drop the line terminator first: otherwise "[^;]+" captures the
    # trailing newline when the ID is the last attribute on the line.
    tmp = line.rstrip("\r\n").split("\t")
    if len(tmp) < 9:
        continue  # blank or truncated record
    if tmp[2] == "gene":
        mobj = gene_id_pattern.search(tmp[8])
        if mobj:
            fw.write("\t".join([tmp[0], tmp[3], tmp[4], tmp[6], mobj.group(1)]) + "\n")
fr.close()
fw.close()
5 Python biopython包处理生物数据
5.1 BioPython包
Blast output – both from standalone and WWW Blast
PubMed and Medline
ExPASy files, like Enzyme and Prosite
SCOP, including ‘dom’ and ‘lin’ files
NCBI – Blast, Entrez and PubMed services
ExPASy – Swiss-Prot and Prosite entries, as well as Prosite searches
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

# Collect the wanted IDs (one per line); only membership is needed, so a
# set is sufficient.
idlist = set()
with open("idlist.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line:  # ignore blank lines instead of storing an empty ID
            idlist.add(line)

# Copy every record whose ID is in the list; "with" closes the output even
# if parsing fails.
with open("get.fa", "w") as f_out:
    for rec in SeqIO.parse("test.fa", "fasta"):
        if rec.id in idlist:
            SeqIO.write(rec, f_out, "fasta")
==任务2 答案:==
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os
import re

os.chdir("D:\\python_script")

# Map each sequence ID to its [start, end] positions (tab-separated input:
# ID, start, end).
idlist = {}
with open("id_pos.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # a blank line would raise IndexError below
        tmp = re.split(r"\t", line)
        idlist[tmp[0]] = [int(tmp[1]), int(tmp[2])]

with open("get_pos.fa", "w") as f_out:
    for rec in SeqIO.parse("test.fa", "fasta"):
        if rec.id in idlist:
            start, end = idlist[rec.id]
            # Positions in the input are 1-based while Python slices are
            # 0-based, hence start - 1.
            rec_new = SeqRecord(rec.seq[start - 1:end], id=rec.id,
                                description=rec.description)
            SeqIO.write(rec_new, f_out, "fasta")
==任务3 答案:==
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

os.chdir("D:\\python_script")
# Convert every FASTQ record to FASTA. The handle is managed by "with"
# because it was never closed before, which can leave buffered output
# unwritten.
with open("fq2fa.fa", "w") as output_handle:
    for rec in SeqIO.parse("test.fq", "fastq"):
        SeqIO.write(rec, output_handle, "fasta")
==任务4 答案:==
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os

os.chdir("D:\\python_script")
# Write each read without its first five positions; "with" closes the
# output even if parsing or writing raises.
with open("trimed.fq", "w") as output_handle:
    for rec in SeqIO.parse("test.fq", "fastq"):
        rec_new = rec[5:]
        SeqIO.write(rec_new, output_handle, "fastq")
import numpy as np
import os

os.chdir("D://python_script//")
# Load columns 0 and 1 of the comma-separated file; unpack=True returns one
# array per selected column.
a, b = np.loadtxt("iris.data", delimiter=',', usecols=[0, 1], unpack=True)
print(a)
print(b)
The whole figure. The figure keeps track of all the child Axes, a smattering of ‘special’ artists (titles, figure legends, etc.), and the canvas.
The top level container Artist is the matplotlib.figure.Figure
This is what you think of as ‘a plot’, it is the region of the image with the data space. A given figure can contain many Axes, but a given Axes object can only be in one Figure.
These are the number-line-like objects. They take care of setting the graph limits and generating the ticks (the marks on the axis) and ticklabels (strings labeling the ticks).
Basically everything you can see on the figure is an artist (even the Figure, Axes, and Axis objects). This includes Text objects, Line2D objects, collection objects, Patch objects … (you get the idea).
Everything in matplotlib is organized in a hierarchy. At the top of the hierarchy is the matplotlib “state-machine environment” which is provided by the matplotlib.pyplot module.
我们使用matplotlib.pyplot 里面的函数来创建绘图,绘图主要有以下三步:
#########################################
# Load packages, modules, etc.
#########################################
import matplotlib.pyplot as plt
import numpy as np