// Mirror of https://github.com/titanscouting/tra-analysis.git
// (synced 2024-11-15 07:36:18 +00:00; 222 lines, 9.0 KiB, Protocol Buffer).
//
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.genomics.v1;

// `Position`, `CigarUnit` and `google.protobuf.ListValue` are referenced by
// the messages in this file but not defined here; they are expected to be
// provided by the imports below.
import "google/api/annotations.proto";
import "google/genomics/v1/cigar.proto";
import "google/genomics/v1/position.proto";
import "google/protobuf/struct.proto";

// Code-generation options for the C++, Go and Java targets.
option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
option java_multiple_files = true;
option java_outer_classname = "ReadAlignmentProto";
option java_package = "com.google.genomics.v1";


// A linear alignment can be represented by one CIGAR string. Describes the
// mapped position and local alignment of the read to the reference.
message LinearAlignment {
  // The position of this alignment.
  Position position = 1;

  // The mapping quality of this alignment. Represents how likely
  // the read maps to this position as opposed to other locations.
  //
  // Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to
  // the nearest integer.
  int32 mapping_quality = 2;

  // Represents the local alignment of this sequence (alignment matches, indels,
  // etc) against the reference, as an ordered list of CIGAR units.
  repeated CigarUnit cigar = 3;
}

// A read alignment describes a linear alignment of a string of DNA to a
// [reference sequence][google.genomics.v1.Reference], in addition to metadata
// about the fragment (the molecule of DNA sequenced) and the read (the bases
// which were read by the sequencer). A read is equivalent to a line in a SAM
// file. A read belongs to exactly one read group and exactly one
// [read group set][google.genomics.v1.ReadGroupSet].
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
//
// ### Reverse-stranded reads
//
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
// the forward or the reverse strand of their associated reference. Strandedness
// of a mapped read is encoded by `alignment.position.reverseStrand`.
//
// If we consider the reference to be a forward-stranded coordinate space of
// `[0, reference.length)` with `0` as the left-most position and
// `reference.length` as the right-most position, reads are always aligned left
// to right. That is, `alignment.position.position` always refers to the
// left-most reference coordinate and `alignment.cigar` describes the alignment
// of this read to the reference from left to right. All per-base fields such as
// `alignedSequence` and `alignedQuality` share this same left-to-right
// orientation; this is true of reads which are aligned to either strand. For
// reverse-stranded reads, this means that `alignedSequence` is the reverse
// complement of the bases that were originally reported by the sequencing
// machine.
//
// ### Generating a reference-aligned sequence string
//
// When interacting with mapped reads, it's often useful to produce a string
// representing the local alignment of the read to reference. The following
// pseudocode demonstrates one way of doing this:
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// The following pseudocode generates a SAM CIGAR string from the
// `cigar` field. Note that this is a lossy conversion
// (`cigar.referenceSequence` is lost).
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
message Read {
  // The server-generated read ID, unique across all reads. This is different
  // from the `fragmentName`.
  string id = 1;

  // The ID of the read group this read belongs to. A read belongs to exactly
  // one read group. This is a server-generated ID which is distinct from SAM's
  // RG tag (for that value, see
  // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // The ID of the read group set this read belongs to. A read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // The fragment name. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // The orientation and the distance between reads from the fragment are
  // consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // The fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // The observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // The read number in sequencing. 0-based and less than numberReads. This
  // field replaces SAM flag 0x40 and 0x80.
  int32 read_number = 8;

  // The number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Whether this read did not pass filters, such as platform or vendor quality
  // controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // The linear alignment for this alignment record. This field is null for
  // unmapped reads.
  LinearAlignment alignment = 11;

  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each read
  // has one and only one alignment where both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple
  // linear alignments that map to different reference contigs. The first
  // linear alignment in the read will be designated as the representative
  // alignment; the remaining linear alignments will be designated as
  // supplementary alignments. These alignments may have different mapping
  // quality scores. In each linear alignment in a chimeric alignment, the read
  // will be hard clipped. The `alignedSequence` and
  // `alignedQuality` fields in the alignment record will only
  // represent the bases for its respective linear alignment.
  bool supplementary_alignment = 13;

  // The bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be
  // shorter than the full read sequence and quality. This will occur if the
  // alignment is part of a chimeric alignment, or if the read was trimmed. When
  // this occurs, the CIGAR for this read will begin/end with a hard clip
  // operator that will indicate the length of the excised sequence.
  string aligned_sequence = 14;

  // The quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // The mapping of the primary alignment of the
  // `(readNumber+1)%numberReads` read in the fragment. It replaces
  // mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // A map of additional read alignment information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, google.protobuf.ListValue> info = 17;
}