mamba create -n 3Dmapper python=3.7 -y
conda activate 3Dmapper
git clone https://github.com/vicruiser/3Dmapper.git
cd 3Dmapper
pip install .
sh r_dependencies.sh
> makestructuraldb -h
usage: makestructuraldb [-h] --pdb <String> [<String> ...] --blast_db <String>
[-o <String>] [-d <String>] [-t <String>] [-e <float>]
[--pident <float>] [-c <float>]
[--interaction <String>] [--biolip] [-p] [-j <int>]
____ _____
|___ \| __ \
__) | | | |_ __ ___ __ _ _ __ _ __ ___ _ __
|__ <| | | | '_ ` _ \ / _` | '_ \| '_ \ / _ \ '__|
___) | |__| | | | | | | (_| | |_) | |_) | __/ |
|____/|_____/|_| |_| |_|\__,_| .__/| .__/ \___|_|
| | | |
|_| |_|
optional arguments:
-h, --help show this help message and exit
--pdb <String> [<String> ...]
PDB file path
--blast_db <String> proteome files path (output of makeblastdb)
-o <String>, --out <String>
output directory
-d <String>, --dist <String>
inter-residue distance threshold in angstroms (5 by default)
-t <String>, --int-type <String>
interface definition. Options are: 'noh' (by default)
to calculate distance considering heavy atoms only;
'h' to compute hydrogen bonds; 'calpha' to compute CA-
CA distance; 'cbeta' to measure distances between CB-
CB (CA in the case of Glycine); 'sidechain' consider
atoms only from sidechains; 'backbone' to calculate
distances between atoms in the backbone only; or 'all'
to calcule interfaces between any possible atom
-e <float>, --evalue <float>
e-value threshold in BLAST. Default is 1e-7.
--pident <float> percent identity threshold between query (pdb chain)
and hit (protein) sequences. Default is 20 percent
-c <float>, --coverage <float>
percent coverage threshold of the protein sequence
(how much of the protein sequence is covered by the
PDB sequence). Default is 0 percent
--interaction <String>
Interaction type: 'protein', 'ligand', 'nucleic',
combination of the previous separated by space, or
'all' (by default)
--biolip consider BioLiP list
(https://zhanggroup.org/BioLiP/ligand_list) to remove
artifact ligands? Default is False
-p, --parallel Parallelize process
-j <int>, --jobs <int>
number of jobs to run in parallel
| >>>Publication link<<<<< |
| victoria.ruiz.serra@gmail.com |
> makevariantsdb -h
$ makevariantsdb -h
usage: makevariantsdb [-h]
(-vf <String> [<String> ...] | -maf <String> [<String> ...])
[-f] [-o <String>] [-s] [-p] [-j <int>]
____ _____
|___ \| __ \
__) | | | |_ __ ___ __ _ _ __ _ __ ___ _ __
|__ <| | | | '_ ` _ \ / _` | '_ \| '_ \ / _ \ '__|
___) | |__| | | | | | | (_| | |_) | |_) | __/ |
|____/|_____/|_| |_| |_|\__,_| .__/| .__/ \___|_|
| | | |
|_| |_|
optional arguments:
-h, --help show this help message and exit
-vf <String> [<String> ...], --varfile <String> [<String> ...]
input VCF, VEP or VEP-like file(s)
-maf <String> [<String> ...], --maf_file <String> [<String> ...]
input MAF file(s)
-f, --force force to owerwrite? Inactive by default
-o <String>, --out <String>
output directory. Default is current directory.
-s, --sort sort input file to split
-p, --parallel Speed up running time. Depends on GNU Parallel. O.
Tange(2011): GNU Parallel - The Command-Line Power
Tool, login: The USENIX Magazine, February 2011: 42-47.
-j <int>, --jobs <int>
number of jobs to run in parallel
| >>>Publication link<<<<< |
| victoria.ruiz.serra@gmail.com |
> mapper -h
usage: mapper [-h]
(-pid <String> [<String> ...] | -vid <String> [<String> ...])
-psdb <String> -vdb <String> [-o <String>] --id_mapping <String>
[-i <String> [<String> ...]] [-c <String> [<String> ...]]
[-d <float>] [--pident <int>] [-e <int>] [-f | -a] [-p]
[-j <int>] [-v] [-l] [-csv] [-hdf]
____ _____
|___ \| __ \
__) | | | |_ __ ___ __ _ _ __ _ __ ___ _ __
|__ <| | | | '_ ` _ \ / _` | '_ \| '_ \ / _ \ '__|
___) | |__| | | | | | | (_| | |_) | |_) | __/ |
|____/|_____/|_| |_| |_|\__,_| .__/| .__/ \___|_|
| | | |
|_| |_|
optional arguments:
-h, --help show this help message and exit
-pid <String> [<String> ...], --prot-id <String> [<String> ...]
one or more IDs of protein, transcripts or genes
provided via command line or from a file
-vid <String> [<String> ...], --var-id <String> [<String> ...]
single or list of variants ids provided via command
line or from a file
-psdb <String> interfaces database directory
-vdb <String>, --vardb <String>
variants database directory
-o <String>, --out <String>
output directory
--id_mapping <String>
File that contains the conversion of protein,
transcripts and gene IDs and optionally APPRIS
isoforms IDs.
-i <String> [<String> ...], --isoform <String> [<String> ...]
if available in the ID mapping file, this parameter
can filter by a single or a list of APPRIS isoforms.
The principal isoform is set by default. Options are:
principal1, principal2, ...
-c <String> [<String> ...], --consequence <String> [<String> ...]
filter by variant or position consequence type
-d <float>, --dist <float>
threshold of interface maximum distance allowed in
angstroms. By default, the maximum value will be the
one selected in makeinterfacedb
--pident <int> threshold of sequence identity (percertage)
-e <int>, --evalue <int>
threshold of evalue
-f, --force force to owerwrite? Inactive by default
-a, --append Two or more calls to the program write are able to
append results to the same output file.
-p, --parallel Parallelize process
-j <int>, --jobs <int>
number of jobs to run in parallel
-v, --verbose Print progress.
-l, --location Map all variants and detect their location.
Output format:
-csv Write the mapped data to a CSV file.
-hdf Write the mappedd data to an HDF5 file using HDFStore.
| >>>Publication link<<<<< |
| victoria.ruiz.serra@gmail.com |
> makevisualization -h
____ _____
|___ \| __ \
__) | | | |_ __ ___ __ _ _ __ _ __ ___ _ __
|__ <| | | | '_ ` _ \ / _` | '_ \| '_ \ / _ \ '__|
___) | |__| | | | | | | (_| | |_) | |_) | __/ |
|____/|_____/|_| |_| |_|\__,_| .__/| .__/ \___|_|
| | | |
|_| |_|
| >>>Publication link<<<<< |
| victoria.ruiz.serra@gmail.com |
usage: makevisualization [-h] -p <string> [<string> ...] [--pdb_list]
[-i <string>] [-s <string>] [-o <string>]
[-n <string>] [-it <string>] [-l <string>]
[-bg <string>] [-ns] [-mol <string>] [-is <string>]
makechimerax generates ChimeraX scripts of the provided PDB code(s) from the
data contained in the mapped file generated by 3Dmapper.
optional arguments:
-h, --help show this help message and exit
-p <string> [<string> ...], --pdb_code <string> [<string> ...]
PDB code/s to be found within the mapped file. If
assembly is not included it will generate a script for
all mapped assemblies of that PDB code.
--pdb_list Specifies that PDBs provided are in a file, sepparated
by spaces, tabs or new lines.
-i <string>, --interface_positions <string>
File containing the variants mapped to interfaces
generated by 3Dmapper.
-s <string>, --structure_positions <string>
File containing the variants mapped to protein
structure generated by 3Dmapper.
-o <string>, --output <string>
Output folder in which the ChimeraX script/s will be stored.
-n <string>, --name <string>
Base name for the ChimeraX scripts.
-it <string>, --inter_type <string>
If interfaces are available, filter by specified
interaction type (ligand, protein or nucleic).
-l <string>, --lighting <string>
Select lighting option: full, soft or simple.
-bg <string>, --background <string>
Select background option: white or black.
-ns, --no_silhouette Display will not present silhouettes.
-mol <string>, --mol_style <string>
Select option for molecule style: ball, sphere or stick.
-is <string>, --itf_style <string>
Select option for molecule style of the interfaces
displayed: ball, sphere or stick.
-f, --force Force to overwrite?
3Dmapperのパイプラインは、構造データベースとバリアントアノテーションファイルの生成から始まり、バリアントのタンパク質構造へのマッピングを経て、ChimeraXを使った結果の可視化で終了する。使用するには、クエリとなるPDBファイルと、検索対象のタンパク質セットのBLAST DBが必要。以下、チュートリアルの流れを確認する。
チュートリアルでは、BLAST+のmakeblastdbコマンドを使って、Ensembl ID: ENST00000367182とENST00000374005のヒト転写産物のタンパク質データベースを作成している(実行後のDBはcloneした3Dmapper/example/1-makestructuraldb/に含まれている)。チュートリアル通りにコマンドを実行するなら以下の通り。
#1 change directory
cd 3Dmapper/example/1-makestructuraldb/
#2 download human proteome (20,598 seqs)
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.fasta.gz
gzip -dv UP000005640_9606.fasta.gz
#3 run makeblastdb command
makeblastdb -in UP000005640_9606.fasta -dbtype prot -out human_proteome_uniprot
makestructuraldbコマンドを使用する。"--pdb"で、PDBファイルあるいはCIF形式ファイルのパスを1行ずつ記載したテキストファイルを指定する。"--blast_db"で、上の手順で作成したBLAST DBを指定する。
makestructuraldb --pdb input_pdbs.txt --blast_db human_proteome_blastdb/human_proteome_uniprot --pident 95 -o outdir
--blast_db proteome files path (output of makeblastdb)
--pident percent identity threshold between query (pdb chain) and hit (protein) sequences. Default is 20 percent
-o output directory
cd 3Dmapper/example/2-makevariantsdb/
makevariantsdb -vf variants.vep
-vf input VCF, VEP or VEP-like file(s)
DB/内にファイルが作成される。これらのファイルには、それぞれの転写産物に関連するバリアントが含まれている。
> ls -lR DB/
cd 3Dmapper/example/3-mapper/
mapper -pid P09769 O15151 -psdb ../1-makestructuraldb/structural_db/structuralDB/ -vdb ../2-makevariantsdb/DBs/varDB/ --id_mapping dict_geneprot_GRCh38_uniprot.txt -csv -a -l -o 3dmapper_results
-pid one or more IDs of protein, transcripts or genes provided via command line or from a file
-psdb interfaces database directory
-vdb variants database directory
--id_mapping File that contains the conversion of protein, transcripts and gene IDs and optionally APPRIS isoforms IDs.
-o output directory
ls -lR 3dmapper_results/
cd 3Dmapper/example/4-makevisualization/
makevisualization -p list_pdbs.txt --pdb_list -i ../3-mapper/3dmapper_results/csv/InterfacePositions_pident20.0_isoform_all_consequence_all.csv --force -s ../3-mapper/3dmapper_results/csv/StructurePositions_pident20.0_isoform_all_consequence_all.csv -is sphere
3Dmapper: a command line tool for BioBank-scale mapping of variants to protein structures
Victoria Ruiz-Serra, Samuel Valentini, Sergi Madroñero, Alfonso Valencia, Eduard Porta-Pardo
Bioinformatics, Published: 02 April 2024
UCSF Chimera入門