
-- NOTE(review): stray listing metadata ("363 lines", "16 KiB") commented out; it is not ASN.1.

--$Id: scoremat.asn 347837 2011-12-21 15:28:42Z boratyng $
-- ===========================================================================
-- National Center for Biotechnology Information
-- This software/database is a "United States Government Work" under the
-- terms of the United States Copyright Act. It was written as part of
-- the author's official duties as a United States Government employee and
-- thus cannot be copyrighted. This software/database is freely available
-- to the public for use. The National Library of Medicine and the U.S.
-- Government have not placed any restriction on its use or reproduction.
-- Although all reasonable efforts have been taken to ensure the accuracy
-- and reliability of the software and data, the NLM and the U.S.
-- Government do not and cannot warrant the performance or results that
-- may be obtained by using this software or data. The NLM and the U.S.
-- Government disclaim all warranties, express or implied, including
-- warranties of performance, merchantability or fitness for any particular
-- purpose.
-- Please cite the author in any work or product based on this material.
-- ===========================================================================
-- Author: Christiam Camacho
-- File Description:
-- ASN.1 definitions for scoring matrix
-- ===========================================================================
EXPORTS Pssm, PssmIntermediateData, PssmFinalData,
PssmParameters, PssmWithParameters;
IMPORTS Object-id FROM NCBI-General
Seq-entry FROM NCBI-Seqset;
-- a rudimentary block/core-model, to be used with block-based alignment
-- routines and threading
-- One typed property that can be attached to a CoreBlock.
BlockProperty ::= SEQUENCE {
    -- kind of property being described
    type      INTEGER {
                  unassigned  (0),
                  threshold   (1),    -- score threshold for heuristics
                  minscore    (2),    -- observed minimum score in CD
                  maxscore    (3),    -- observed maximum score in CD
                  meanscore   (4),    -- observed mean score in CD
                  variance    (5),    -- observed score variance
                  name        (10),   -- just name the block
                  is-optional (20),   -- block may not have to be used
                  other       (255)
              },
    -- NOTE(review): most of the property kinds above are numeric, yet only a
    -- text value is declared here. A numeric member may have been lost from
    -- this copy of the file; confirm against the upstream definition.
    textvalue VisibleString OPTIONAL  -- value of the property as text
}
-- A single core block, located on the query by its start and stop positions.
CoreBlock ::= SEQUENCE {
    start    INTEGER,                            -- begin of block on query
    stop     INTEGER,                            -- end of block on query
    minstart INTEGER OPTIONAL,                   -- optional N-terminal extension
    maxstop  INTEGER OPTIONAL,                   -- optional C-terminal extension
    property SEQUENCE OF BlockProperty OPTIONAL  -- properties attached to this block
}
-- Length bounds for one unaligned (loop) region.
LoopConstraint ::= SEQUENCE {
    minlength INTEGER DEFAULT 0,       -- minimum length of unaligned region
    maxlength INTEGER DEFAULT 100000   -- maximum length of unaligned region
}
-- Core definition: the list of core blocks plus the loop constraints
-- for the unaligned regions.
CoreDef ::= SEQUENCE {
    nblocks         INTEGER,                      -- number of core elements/blocks
    blocks          SEQUENCE OF CoreBlock,        -- nblocks locations
    loops           SEQUENCE OF LoopConstraint,   -- (nblocks+1) constraints
    isDiscontinuous BOOLEAN OPTIONAL,             -- is it a discontinuous domain
    insertions      SEQUENCE OF INTEGER OPTIONAL  -- positions of long insertions
}
-- Annotation of a single site, located by a start and a stop position.
Site-annot ::= SEQUENCE {
    -- location of the annotation: start position
    startPosition INTEGER,

    -- location of the annotation: stop position
    -- NOTE(review): the original comment on these two members is cut off
    -- ("start and stop position in the ..."), so the coordinate system is
    -- not stated in this copy; confirm before relying on it.
    stopPosition  INTEGER,

    -- holds description or names that can be used for labels in
    -- visualization
    description   VisibleString OPTIONAL,

    -- type of the annotated feature, similarly to Align-annot in NCBI-Cdd
    type          INTEGER OPTIONAL,

    -- additional names for the annotation
    aliases       SEQUENCE OF VisibleString OPTIONAL,

    -- motif to validate mapping of sites
    motif         VisibleString OPTIONAL,

    -- how the motif is used:
    --   0 for validation
    --   1 for motif in seqloc
    --   2 for multiple motifs in seqloc
    motifuse      INTEGER OPTIONAL
}
-- A list of Site-annot entries.
Site-annot-set ::=
    SEQUENCE OF Site-annot
-- ===========================================================================
-- PSI-BLAST, formatrpsdb, RPS-BLAST workflow:
-- ===========================================
-- Two possible inputs to PSI-BLAST and formatrpsdb:
-- 1) PssmWithParams where pssm field contains intermediate PSSM data (matrix
-- of frequency ratios)
-- 2) PssmWithParams where pssm field contains final PSSM data (matrix of
-- scores and statistical parameters) - such as written by cddumper
-- In case 1, PSI-BLAST's PSSM engine is invoked to create the PSSM and perform
-- the PSI-BLAST search or build the PSSM to then build the RPS-BLAST database.
-- In case 2, PSI-BLAST's PSSM engine is not invoked and the matrix of scores
-- and statistical parameters are used to perform the search in PSI-BLAST; the
-- same data and the data in PssmWithParams::params::rpsdbparams are used to
-- build the PSSM and ultimately the RPS-BLAST database.
-- reads ++++++++++++++ writes
-- PssmWithParams ====> + PSI-BLAST + =====> PssmWithParams
-- ++++++++++++++ | ^
-- ^ | |
-- | | |
-- +===========================================+ |
-- | |
-- +===========================================+ |
-- | |
-- reads | |
-- v |
-- +++++++++++++++ writes +++++++++++++++++++++++ |
-- | formatrpsdb | =====> | RPS-BLAST databases | |
-- +++++++++++++++ +++++++++++++++++++++++ |
-- ^ |
-- | |
-- | reads |
-- +++++++++++++ |
-- | RPS-BLAST | |
-- +++++++++++++ |
-- |
-- reads ++++++++++++ writes |
-- Cdd ======> | cddumper | =============================+
-- ++++++++++++
-- ===========================================================================
-- Contains the PSSM's scores and its associated statistical parameters.
-- Dimensions and order in which scores are stored must be the same as that
-- specified in Pssm::numRows, Pssm::numColumns, and Pssm::byrow
-- Final PSSM data: the matrix of scores and its statistical parameters.
--
-- NOTE(review): in this copy of the file the declarations of `scores`, `h`
-- and `hUngapped` were missing (only their comments survived, and the member
-- list ended on a dangling comma). They are restored below so the SEQUENCE
-- is well formed; confirm names and types against the upstream
-- NCBI-ScoreMat definition.
PssmFinalData ::= SEQUENCE {

    -- PSSM's scores
    scores          SEQUENCE OF INTEGER,

    -- Karlin & Altschul parameter produced during the PSSM's calculation
    lambda          REAL,

    -- Karlin & Altschul parameter produced during the PSSM's calculation
    kappa           REAL,

    -- Karlin & Altschul parameter produced during the PSSM's calculation
    h               REAL,

    -- scaling factor used to obtain more precision when building the PSSM.
    -- (i.e.: scores are scaled by this value). By default, PSI-BLAST's PSSM
    -- engine generates PSSMs which are not scaled-up, however, if PSI-BLAST is
    -- given a PSSM which contains a scaled-up PSSM (indicated by having a
    -- scalingFactor greater than 1), then it will scale down the PSSM to
    -- perform the initial stages of the search with it.
    -- N.B.: When building RPS-BLAST databases, if formatrpsdb is provided
    -- scaled-up PSSMs, it will ensure that all PSSMs used to build the
    -- RPS-BLAST database are scaled by the same factor (otherwise, RPS-BLAST
    -- will silently produce incorrect results).
    scalingFactor   INTEGER DEFAULT 1,

    -- Karlin & Altschul parameter produced during the PSSM's calculation
    lambdaUngapped  REAL OPTIONAL,

    -- Karlin & Altschul parameter produced during the PSSM's calculation
    kappaUngapped   REAL OPTIONAL,

    -- Karlin & Altschul parameter produced during the PSSM's calculation
    hUngapped       REAL OPTIONAL
}
-- Contains the PSSM's intermediate data used to create the PSSM's scores
-- and statistical parameters. Dimensions and order in which scores are
-- stored must be the same as that specified in Pssm::numRows,
-- Pssm::numColumns, and Pssm::byrow
-- Intermediate PSSM data, from which scores and statistical parameters
-- are derived.
--
-- NOTE(review): in this copy of the file every member declaration of this
-- SEQUENCE was missing; only the per-member comments survived. The members
-- below are restored, one per surviving comment. Names and element types
-- must be confirmed against the upstream NCBI-ScoreMat definition before
-- this type is used for encoding or decoding.
PssmIntermediateData ::= SEQUENCE {

    -- observed residue frequencies (or counts) per position of the PSSM
    -- (prior to application of pseudocounts)
    resFreqsPerPos          SEQUENCE OF INTEGER OPTIONAL,

    -- Weighted observed residue frequencies per position of the PSSM.
    -- (N.B.: each position's weights should add up to 1.0).
    -- This field corresponds to f_i (f sub i) in equation 2 of
    -- Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
    -- NOTE: this is needed for diagnostics information only (i.e.:
    -- -out_ascii_pssm option in psiblast)
    weightedResFreqsPerPos  SEQUENCE OF REAL OPTIONAL,

    -- PSSM's frequency ratios
    freqRatios              SEQUENCE OF REAL,

    -- Information content per position of the PSSM
    -- NOTE: this is needed for diagnostics information only (i.e.:
    -- -out_ascii_pssm option in psiblast)
    informationContent      SEQUENCE OF REAL OPTIONAL,

    -- Relative weight for columns of the PSSM without gaps to pseudocounts
    -- NOTE: this is needed for diagnostics information only (i.e.:
    -- -out_ascii_pssm option in psiblast)
    gaplessColumnWeights    SEQUENCE OF REAL OPTIONAL,

    -- Used in sequence weights computation
    -- NOTE: this is needed for diagnostics information only (i.e.:
    -- -out_ascii_pssm option in psiblast)
    sigma                   SEQUENCE OF REAL OPTIONAL,

    -- Length of the aligned regions per position of the query sequence
    -- NOTE: this is needed for diagnostics information only (i.e.:
    -- -out_ascii_pssm option in psiblast)
    intervalSizes           SEQUENCE OF INTEGER OPTIONAL,

    -- Number of matching sequences per position of the PSSM (including the
    -- query)
    -- NOTE: this is needed for diagnostics information only (i.e.:
    -- -out_ascii_pssm option in psiblast)
    numMatchingSeqs         SEQUENCE OF INTEGER OPTIONAL,

    -- Number of independent observations per position of the PSSM
    -- NOTE: this is needed for building CDD database for DELTA-BLAST
    numIndeptObsr           SEQUENCE OF REAL OPTIONAL
}
-- Position-specific scoring matrix
-- Column indices on the PSSM refer to the positions corresponding to the
-- query/master sequence, i.e. the number of columns (N) is the same
-- as the length of the query/master sequence.
-- Row indices refer to individual amino acid types, i.e. the number of
-- rows (M) is the same as the number of different residues in the
-- alphabet we use. Consequently, row labels are amino acid identifiers.
-- PSSMs are stored as linear arrays of integers. By default, we store
-- them column-by-column, M values for the first column followed by M
-- values for the second column, and so on. In order to provide
-- flexibility for external applications, the boolean field "byrow" is
-- provided to specify the storage order.
-- The position-specific scoring matrix itself: dimensions, optional row
-- labels, storage order, the master sequence, and the matrix data in
-- intermediate and/or final form.
--
-- NOTE(review): in this copy of the file the declarations of `isProtein`
-- and `byRow` were missing (only their comments survived). They are restored
-- below; `byRow` defaults to FALSE because the description of this type says
-- matrices are stored column-by-column by default. Confirm names, types and
-- defaults against the upstream NCBI-ScoreMat definition.
Pssm ::= SEQUENCE {

    -- Is this a protein or nucleotide scoring matrix?
    isProtein        BOOLEAN DEFAULT TRUE,

    -- PSSM identifier
    identifier       Object-id OPTIONAL,

    -- The dimensions of the matrix are returned so the client can
    -- verify that all data was received.
    numRows          INTEGER,   -- number of rows
    numColumns       INTEGER,   -- number of columns

    -- row-labels is given to note the order of residue types so that it can
    -- be cross-checked between applications.
    -- If this field is not given, the matrix values are presented in
    -- order of the alphabet ncbistdaa is used for protein, ncbi4na for nucl.
    -- for proteins the values returned correspond to
    -- (-,-), (-,A), (-,B), (-,C) ... (A,-), (A,A), (A,B), (A,C) ...
    rowLabels        SEQUENCE OF VisibleString OPTIONAL,

    -- are matrices stored row by row?
    byRow            BOOLEAN DEFAULT FALSE,

    -- PSSM representative sequence (master)
    query            Seq-entry OPTIONAL,

    -- both intermediateData and finalData can be provided, but at least one of
    -- them must be provided.
    -- N.B.: by default PSI-BLAST will return the PSSM in its
    -- PssmIntermediateData representation.

    -- Intermediate or final data for the PSSM
    intermediateData PssmIntermediateData OPTIONAL,

    -- Final representation for the PSSM
    finalData        PssmFinalData OPTIONAL
}
-- This structure is used to create the RPS-BLAST database auxiliary file
-- (*.aux) and it contains parameters set at creation time of the PSSM.
-- Also, the matrixName field is used by formatrpsdb to build a PSSM from
-- a Pssm structure which only contains PssmIntermediateData.
-- Parameters recorded when the PSSM was created, for use by formatrpsdb.
--
-- NOTE(review): in this copy of the file the declarations of the two gap
-- penalty members were missing (only their comments survived, after a
-- dangling comma). They are restored below as optional integers; confirm
-- names and types against the upstream NCBI-ScoreMat definition.
FormatRpsDbParameters ::= SEQUENCE {

    -- name of the underlying score matrix whose frequency ratios were
    -- used in PSSM construction (e.g.: BLOSUM62)
    matrixName  VisibleString,

    -- gap opening penalty corresponding to the matrix above
    gapOpen     INTEGER OPTIONAL,

    -- gap extension penalty corresponding to the matrix above
    gapExtend   INTEGER OPTIONAL
}
-- Populated by PSSM engine of PSI-BLAST, original source for these values
-- are the PSI-BLAST options specified using the BLAST options API
-- Parameters that accompany a PSSM: the pseudocount constant, formatrpsdb
-- parameters, alignment constraints, a bit score threshold and annotated
-- sites.
--
-- NOTE(review): in this copy of the file the declaration of `pseudocount`
-- was missing (only its comment survived). It is restored below as an
-- optional integer; confirm name and type against the upstream
-- NCBI-ScoreMat definition.
PssmParameters ::= SEQUENCE {

    -- pseudocount constant used for PSSM. This field corresponds to beta in
    -- equation 2 of Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
    pseudocount     INTEGER OPTIONAL,

    -- data needed by formatrpsdb to create RPS-BLAST databases. matrixName is
    -- populated by PSI-BLAST
    rpsdbparams     FormatRpsDbParameters OPTIONAL,

    -- alignment constraints needed by sequence-structure threader
    -- and other global or local block-alignment algorithms
    constraints     CoreDef OPTIONAL,

    -- bit score threshold for specific conserved domain hits
    bitScoreThresh  REAL OPTIONAL,

    -- conserved functional sites with annotations
    annotatedSites  Site-annot-set OPTIONAL
}
-- Envelope containing PSSM and the parameters used to create it.
-- Provided for use in PSI-BLAST, formatrpsdb, and for the structure group.
-- Top-level envelope: a PSSM together with the (optional) parameters that
-- were used to create it.
PssmWithParameters ::= SEQUENCE {

    -- This field is applicable to PSI-BLAST and formatrpsdb.
    -- When both the intermediate and final PSSM data are provided in this
    -- field, the final data (matrix of scores and associated statistical
    -- parameters) takes precedence and that data is used for further
    -- processing. The rationale for this is that the PSSM's scores and
    -- statistical parameters might have been calculated by other applications
    -- and it might not be possible to recreate it by using PSI-BLAST's PSSM
    -- engine.
    pssm    Pssm,

    -- This field's rpsdbparams is used to specify the values of options
    -- for processing by formatrpsdb. If these are not set, the command
    -- line defaults of formatrpsdb are applied. This field is used
    -- by PSI-BLAST to verify that the underlying score matrix used to build
    -- the PSSM is the same as the one being specified through the BLAST
    -- Options API. If this field is omitted, no verification will be
    -- performed, so be careful to keep track of what matrix was used to build
    -- the PSSM or else the results produced by PSI-BLAST will be unreliable.
    params  PssmParameters OPTIONAL
}