From 56bd759df1d0c750a065b8c845e93d5dfa6b549d Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Sat, 8 Aug 2015 13:49:04 -0700 Subject: proj/gentoo: Initial commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit represents a new era for Gentoo: Storing the gentoo-x86 tree in Git, as converted from CVS. This commit is the start of the NEW history. Any historical data is intended to be grafted onto this point. Creation process: 1. Take final CVS checkout snapshot 2. Remove ALL ChangeLog* files 3. Transform all Manifests to thin 4. Remove empty Manifests 5. Convert all stale $Header$/$Id$ CVS keywords to non-expanded Git $Id$ 5.1. Do not touch files with -kb/-ko keyword flags. Signed-off-by: Robin H. Johnson X-Thanks: Alec Warner - did the GSoC 2006 migration tests X-Thanks: Robin H. Johnson - infra guy, herding this project X-Thanks: Nguyen Thai Ngoc Duy - Former Gentoo developer, wrote Git features for the migration X-Thanks: Brian Harring - wrote much python to improve cvs2svn X-Thanks: Rich Freeman - validation scripts X-Thanks: Patrick Lauer - Gentoo dev, running new 2014 work in migration X-Thanks: Michał Górny - scripts, QA, nagging X-Thanks: All of other Gentoo developers - many ideas and lots of paint on the bikeshed --- sci-biology/goby-cpp/Manifest | 3 + sci-biology/goby-cpp/files/Alignments.proto | 597 +++++++++++++++++++++ sci-biology/goby-cpp/files/Reads.proto | 96 ++++ .../files/goby-cpp-2.0.1-underlinking.patch | 16 + sci-biology/goby-cpp/goby-cpp-1.9.7.3.ebuild | 26 + sci-biology/goby-cpp/goby-cpp-1.9.8.1.ebuild | 26 + sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild | 42 ++ sci-biology/goby-cpp/metadata.xml | 5 + 8 files changed, 811 insertions(+) create mode 100644 sci-biology/goby-cpp/Manifest create mode 100644 sci-biology/goby-cpp/files/Alignments.proto create mode 100644 sci-biology/goby-cpp/files/Reads.proto create mode 100644 sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch 
create mode 100644 sci-biology/goby-cpp/goby-cpp-1.9.7.3.ebuild create mode 100644 sci-biology/goby-cpp/goby-cpp-1.9.8.1.ebuild create mode 100644 sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild create mode 100644 sci-biology/goby-cpp/metadata.xml (limited to 'sci-biology/goby-cpp') diff --git a/sci-biology/goby-cpp/Manifest b/sci-biology/goby-cpp/Manifest new file mode 100644 index 000000000000..b9ab55a817cd --- /dev/null +++ b/sci-biology/goby-cpp/Manifest @@ -0,0 +1,3 @@ +DIST goby_1.9.7.3-cpp.zip 127215 SHA256 8493daa7c850732c6c48d4512bd26b7eec411a729b39d9861a4a6aae08faa674 +DIST goby_1.9.8.1-cpp.zip 134904 SHA256 2f1bd87f2870af178f34a8e7c11819aa9e42f35e20f1985d2ceb054f452e2a97 SHA512 d31cd7f0be19074bfe8da74d9f2510f0e0f15fe6c485bbed8520052468d2cd2f1bc5fcad8b0d6a1586f5acde73db326059f45994ecfbb5fb6c09692d8e155190 WHIRLPOOL 6ce51c46f8802d31068f510f6da13b2920086eafdae24506830b42d79e48eb6ed9cac48a96090a81964daebf4a0c8f21c490ca3b0af2f589ac57647bde1be79f +DIST goby_2.0.1-cpp.zip 177718 SHA256 5ec57b833cb1a0f53e975112d1c360b14a9b17cfff3fb0ad77dd70672c1881db SHA512 992bd10d5538dec1478820f26151dd311f4de13e7947b49f0b06d6cbdd4b71deeb3aa8a4c6a598fb92fbcb9cbf4ff97bf81205c9389d4a0da4443317e48aea9f WHIRLPOOL ab94cf674703917b6f0cde812d0fbcd94e18fb6055b30d6a1eefa1e4cb5b76bbe18c67388c66e25e87e522df9a9946b0eae5a164428abe874a382f5bc39a13d0 diff --git a/sci-biology/goby-cpp/files/Alignments.proto b/sci-biology/goby-cpp/files/Alignments.proto new file mode 100644 index 000000000000..fe7f56647644 --- /dev/null +++ b/sci-biology/goby-cpp/files/Alignments.proto @@ -0,0 +1,597 @@ +package goby; + +option java_package = "edu.cornell.med.icb.goby.alignments"; + +option optimize_for = SPEED; + +/* + This message is written to 'basename'.entries as a very large chunked collection. +*/ +message AlignmentCollection { + repeated AlignmentEntry alignment_entries = 1; +} + + +message AlignmentEntry { + /* Multiplicity of this entry. 
The number of times this alignment entry would be repeated exactly the same if + query redundancy had not been removed by read factorization. + */ + optional uint32 multiplicity = 7; + + /* + Compressed stream of data. Removed since Goby 2.0 supports chunk codecs. Do not reuse field index 23 + optional bytes compressed_data = 23; + */ + + /* An integer that uniquely identifies the query (a short read) in a set of alignment runs. When several + alignment runs are made with the same set of query sequences, equality of query index means that the query + sequences were the same. (Comparing integers for equality is much faster than comparing strings.) + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 query_index = 1; + /* An integer that uniquely identifies the target (e.g., a chromosome) in a set of alignment runs. When several + alignment runs are made with the same set of target sequences, equality of target index means that the target + sequence was the same across the runs. (Comparing integers for equality is much faster than comparing strings.) + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 target_index = 2; + /* + The position on the target of the start of the alignment between the query and the target. + In the following example, position is 3 because the third base of the query 'C' was aligned with + position 3 of the reference (two read bases were soft clipped: "ct"). This example shows that the + alignment can start at a mismatch if it was so constructed by the aligner. + + 0123456789 + AAAAGTCAAA target + ctCGTC query + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 position = 3; + + /* + True when the query matches the target on the reverse strand + */ + optional bool matching_reverse_strand = 6; + + /* + The position on the query where the alignment starts. 
This value is different from zero + when some bases/residues of the query could not be aligned with the target. + TODO: Rename this to left_trim. Add a right_trim property. + */ + optional uint32 query_position = 5; + + /* + The score of the alignment, where larger scores indicate better matches between the query and the target. + If an aligner outputs only the number of mismatches between query and target, the score is taken to be + -(#mismatches(query,target)). + */ + optional float score = 4; + + /* + Number of bases/residues that differ in the alignment between query and target sequences. + */ + optional uint32 number_of_mismatches = 8; + + /* + Cumulative number of insertions and/or deletions present in the alignment. + */ + optional uint32 number_of_indels = 9; + + /* + Number of bases that have been aligned for the query. Please note that query_aligned_length must be + less or equal to query_length. + */ + optional uint32 query_aligned_length = 11; + + /* + Number of bases that have been aligned for the target. + */ + optional uint32 target_aligned_length = 12; + + repeated SequenceVariation sequence_variations = 13; + + /* + Length of the query sequence. + */ + optional uint32 query_length = 10; + /* + Mapping Quality (phred-scaled posterior probability that the mapping + position of this read is incorrect). Please note that different aligners + may estimate mapping quality with different approaches, resulting in aligner + specific differences in the distribution of mapping quality. It is recommended + to condition mapping quality on the aligner that produced the specific alignment + being processed. See aligner name and version in the header. + Note that the following description is preliminary. A clear specification is + needed: + The mapping quality should be proportional to the + log of the probability that the given mapping is the "correct" one. 
+ So if there are five equally good mappings of a read to the genome, + the probability of each would be 0.2, and the mapping quality would be + something like -10*log10(1-0.2) = 1. If a mapping is highly likely, + say a 1e-4 of it being wrong, then the mapping quality would be + -10*log10(1e-4) = 40. + */ + optional int32 mapping_quality = 14; + + /* + If this read was aligned with a pair, the flags for the pair alignment (based on SAM): + 000000001 paired + 000000010 properly paired + 000000100 read unmapped + 000001000 mate unmapped + 000010000 read reverse strand + 000100000 mate reverse strand + 001000000 first in pair + 010000000 second in pair + 100000000 not primary alignment + */ + optional uint32 pair_flags = 15; + + /* + If there is an alignment entry for the paired read (the paired read was mapped), a link to the entry is given. + */ + optional RelatedAlignmentEntry pair_alignment_link = 16; + + /* Index of the read fragment from which this alignment was obtained. */ + optional uint32 fragment_index = 17; + + /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more + alignment entries, one for each matching part of the read, and link these entries with + spliced_alignment_links. The field spliced_forward_alignment_link points to the next + AlignmentEntry in the chain of spliced alignments. + */ + optional RelatedAlignmentEntry spliced_forward_alignment_link = 18; + + /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more + alignment entries, one for each matching part of the read, and link these entries with + spliced_alignment_links. The field spliced_backward_alignment_link points to the previous + AlignmentEntry in the chain of spliced alignments. 
+ */ + optional RelatedAlignmentEntry spliced_backward_alignment_link = 22; + + /* + If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two alignment entries, one for each + matching part of the read, and flag describes the spliced_alignment_link with these + binary flags: + 000000001 normal + 000000010 novel + */ + optional uint32 spliced_flags = 19; + + /* The size of the insert used when making the sequence library. This is the total size of the DNA + fragment to sequence, without the adapters. This is not the length of sequence that separates the reads. + See http://seqanswers.com/forums/showthread.php?t=8730 for details. Insert size is inferred for each pair + of reads by the aligner and is recorded here if was estimated (i.e., for paired-end reads). + */ + optional sint32 insert_size = 20; + + /* + The sample index. Uniquely identifies the aligned sample this read was read from. Storing the sample index in the + alignment entry makes it possible to concat alignments from different origins and track what sample originally + contained each entry. + */ + optional uint32 sample_index = 21; + /* + The total number of times the query index associated with this entry occurs across the entire alignment file. + + This field is used to purge queryIndex->smallIndex associations after all instances of a queryindex have + been seen (see QueryIndexPermutation class). When each entry has a value for this field, the header field + query_index_occurrences is true. + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 query_index_occurrences = 25; + /* + The total number of times the read matches the reference across the entire alignment file. This differs from + query_index_occurrences because reads that are matching through splice and pair links count as one for ambiguity. 
+ The field can be used to filter by ambiguity-threshold on the fly after an alignment has been done (to restrict + entries to more smaller thresholds). When each entry has a value for this field, the header field + ambiguity_stored_in_entries is true. + + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 ambiguity = 27; + /* + List of BAM attributes, if the alignment was imported from BAM. The attributes are stored in exactly the format + allowed for BAM. For instance, X0:i:9 X1:i:1 MD:Z:68 RG:Z:SRR084825 will be stored as four strings: + "X0:i:9", "X1:i:1", "MD:Z:68", "RG:Z:SRR084825". Note that sam-to-compact will interpret some BAM attributes + and populate goby native fields. Such tags do not appear in bam_attributes, and are instead re-generated from + the corresponding goby native fields. + Since Goby 2.0. + */ + repeated string bam_attributes = 50; + /* + Quality scores for all bases of the read. + Since Goby 2.0. + */ + optional bytes read_quality_scores = 55; + + /* + Origin index. An integer that references a ReadOriginInfo message in the alignment header and + makes it possible to track the origin of the read (especially useful after several alignments + have been merged/concatenated). + (Since Goby 2.0). + */ + optional uint32 read_origin_index = 26; + /* + Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially + erroneous bases, or bases that belong to a different part of the reference genome. Left clipped bases are + stored in this field as character bases, or as an equal sign character '=' when the clipped base did match + the reference base. For instance "A=G" for three soft-clipped bases, the middle one matching the genome at + this position. The number of bases in softClippedBasesLeft is exactly equal to queryPosition. 
+ */ + optional string softClippedBasesLeft = 30; + /* + Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially + erroneous bases, or bases that belong to a different part of the reference genome. Right clipped bases are + stored in this field as character bases, or as an equal sign character '=' when the clipped base did match + the reference base. The number of bases in softClippedBasesRight is exactly equal + to queryLength - queryAlignedLength - queryPosition. + */ + optional string softClippedBasesRight = 31; + + /* + Quality scores for bases in softClippedBasesLeft. Stored in Phred Units. + */ + optional bytes softClippedQualityLeft = 32; + /* + Quality scores for bases in softClippedBasesRight. Stored in Phred Units. + */ + optional bytes softClippedQualityRight = 33; + /* + Sequence for a read placed near this entry, but unmapped to the reference sequence. For instance, used to record + the sequence of a mate that did not map to the reference. We know that the mate maps in the proximity of this entry + (it is placed) but are unable to map it to a specific genomic position. The sequence is always given as obtained + from the reads file. + */ + optional string placedUnmappedSequence=40; + /* + Quality scores for a read placed near this entry. Phred units. + */ + optional bytes placedUnmappedQuality=41; + + /* + Read name. In SAM/BAM this is referred to as QNAME. Paired and segmented reads will have the same Read name. + */ + optional string readName=42; +} + +/* A link to another alignment entry. This message type is used to represent relations + between alignments, such as the relation between the two read fragments in a paired-end protocol, + or the relation between parts of reads that align through an exon exon junction and map in + different locations of the genome. + */ +message RelatedAlignmentEntry { + /* Target index of the location where the other alignment entry is mapped. 
+ This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 target_index = 1; + + /* Position on the reference where the other alignment entry is mapped. * + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 position = 2; + + /* Index of the fragment for the related alignment entry. This index + makes it possible to identify which of the read fragments mapped to the given + location is related to the source alignment entry. + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 fragment_index = 3; + + optional uint32 optimized_index=50; +} + +/* + Represents sequence variations between the query and the reference sequences. Many variations can be represented. + For instance, an insertion at position 5 in the reference would be represented as from="A", to="" position=5. + A mutation T->G at position 6 would be rendered as from="T", to="G" position=6. Padded alignments (see SAM description) + can be described by a combination of pair-wise alignments, where the gap character '-' is used to indicate that no + base exists in the sequence considered for the alignment position, for instance: + + - Padding example: + + 123 (<-positions) +ref A-C + A-T [from="-" to="" position=2] [from="C" to="T" position=3] + ACT [from="" to="C" position=2] [from="C" to="T" position=3] + A-T [from="-" to="" position=2] [from="C" to="T" position=3] + + - Mutation example: + 123 (<-positions) +ref ATT + ACT [from="T" to="C" position=2] + + -- Example of deletion in a read: + 123 (<-positions) +ref ATT + A-T [from="T" to="-" position=2] + + -- Example of insertion of two base pairs in a read: + 12345 (<-positions) +ref A--TT + ACCTT [from="" to="CC" position=2] + + */ +message SequenceVariation { + /* The reference bases. Can include one or more gap characters '-', to indicate that the reference sequence has + no base at this alignment position. 
+ This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional string from = 2; + /* The read bases that differ from the reference sequence. Can include one or more gap characters '-', to indicate + that the query sequence has no base at this alignment position. + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional string to = 1; + /* + The position of the variation on the read, as if the read always matched on the forward strand. + Adding position to the index where the reference starts aligning the read yields the position of the variation + in reference/target sequence space. Since position starts at one the resulting position will also be one based. + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 position = 3; + /* + The position of the variation, starting from the beginning of the aligned read (position 1), and up to the length + of the read (inclusive). Use this index if you need to know how far the variation is observed from the beginning + of the sequenced read. When the read has an insertion, this index records the position immediately before the base + where the bases are inserted (these bases are in the to field). + When the read has a deletion, read_index records the position in the read after which the bases that would align + in the reference are missing (these bases are in the from field). + This field is required (enforced by semantic validation in Goby 2.0+). + */ + optional uint32 read_index = 5; + + /** + The read base quality scores for those bases that are given in the to field. This field + is populated when the reads used to perform the search include quality scores, and when + the alignment parser can extract the information from the aligner's output. + (this option is currently not implemented in Goby.) 
+ */ + optional bytes to_quality = 4; + +} +/* + This message is written to 'basename'.header +*/ + +message AlignmentHeader { + /* + The smallest possible query index in this alignment. Data stored as an array where + queryIndex is the array index will be stored with only the elements in the inclusive + range [smallestSplitQueryIndex largestSplitQueryIndex] + Such data structures include queryLength and some arrays in the TooManyHits data + structure. + */ + optional uint32 smallest_split_query_index = 9; + /* + The largest possible query index in this alignment. Data stored as an array where + queryIndex is the array index will be stored with only the elements in the inclusive + range [smallestSplitQueryIndex largestSplitQueryIndex] + Such data structures include queryLength and some arrays in the TooManyHits data + structure. + */ + optional uint32 largest_split_query_index = 11; + + /* Mapping from query identifier name to query index (as used in alignment entries). + */ + optional IdentifierMapping query_name_mapping = 1; + + /* Mapping from target identifier name to target index (as used in alignment entries). + */ + optional IdentifierMapping target_name_mapping = 2; + + /* + The number of query sequences + */ + optional uint32 number_of_queries = 5; + /* + The number of target sequences + */ + optional uint32 number_of_targets = 6; + /* + The number of reads that were aligned to the reference and are represented in this alignment archive. + */ + optional uint32 number_of_aligned_reads = 7; + + /* + Length of the query sequences. One number per query, in the order of increasing query index. + This information has been moved to the individual alignment entries. + */ + repeated uint32 query_length = 3 [deprecated = true]; + /* + If query length is constant across all the queries, this field contains the constant length. + In such cases, query_length will be empty. + */ + optional uint32 constant_query_length = 10; + + /* + Length of the target sequences. 
One number per target, in the order of increasing target index. + The target indexes must be 0..(number of targets - 1). + */ + repeated uint32 target_length = 8; + /* + Indicates whether this alignment is sorted by position. True: the alignment entries occur in sorted + order, such that entry a occurs before entry b if a.targetIndex< b.targetIndex or, when entries + have the same target, when a.position < b.position. + */ + optional bool sorted = 13; + + /* + Indicates whether this alignment is indexed by position. When this attribute is true, a file called + 'basename'.index exists that contains the AlignmentIndex message (GZip compressed). + */ + optional bool indexed = 14; + /* + True when query lengths are stored in alignment entries (Goby 1.7+). + */ + optional bool query_lengths_stored_in_entries = 15; + /* + Name of the aligner that produced this alignment. + */ + optional string aligner_name = 17; + /* + Version number for the aligner implementation that produced this alignment. + */ + optional string aligner_version = 18; + /* + The version of Goby that created this alignment file. + */ + optional string version = 25; + + /* + Sample basenames, in the order of increasing sampleIndex, starting with sampleIndex=0. + */ + + repeated string sample_basename = 30; + + /* + This field is true when the query indices of alignment entries were permuted to smaller indices. Only sorted + alignments can have query_indices_were_permuted=true. When the field is true, and you need to retrieve the + original query-index of an alignment (because you want to retrieve the specific read(s) from a read file for + instance), you will need the information in the permutation file (extension basename.perm) and transform back + each small index of interest to the original query index. + */ + optional bool query_indices_were_permuted = 26; + /* + This field is true when entries in the alignment .entries file all have the query_index_occurrences field populated + (Since Goby 2.0). 
+ */ + optional bool query_index_occurrences = 35; + + /* + This field is true when entries in the alignment .entries file all have the ambiguity field populated + (Since Goby 2.0). + */ + optional bool ambiguity_stored_in_entries = 36; + /* + This field is true when entries in the alignment .entries file all have the read_quality_scores field populated. + (Since Goby 2.0). + */ + optional bool all_read_quality_scores = 40; + /* + A description of the origin of sets of reads. Serves a similar function to BAM read groups, but more flexible and + efficient. Instead of storing strings, we use integers in the entries. + Alignment entries will link to a specific ReadOriginInfo with the origin_index field. + (Since Goby 2.0). + */ + repeated ReadOriginInfo read_origin = 27; +} + +message IdentifierMapping { + repeated IdentifierInfo mappings = 1; +} + +message IdentifierInfo { + required string name = 1; + required uint32 index = 2; +} + + +/* + A description of the origin of sets of reads. Stored in the Goby alignment header and linked + from alignment entries. Goby makes it possible to adapt origin equivalence rules on the fly + efficiently. To do this, it is sufficient to read the header of the alignment, decide which + ReadOriginInfo instances are equivalent (e.g., by looking at sample, platform, library, or + other fields in the message), then construct a function e(a):int. This function takes + one originIndex parameter and returns another integer that maps to an equivalence class. The + equivalence class can be used to estimate error models for entries that belong to each class, + for instance. + (Since Goby 2.0). + */ +message ReadOriginInfo { + /* + Origin index. An integer that links alignment entries to their origin information. + */ + required uint32 origin_index = 1; + /* + Identifier that describes the origin of the reads. This field is compatible with the ID/identifier field of BAM read + groups. Free text.
+ */ + required string origin_id = 2; + /* + The sample from which the reads were sequenced. This field is compatible with the SM/sample field of BAM read + groups. Free text. + */ + optional string sample = 4; + /* + The platform on which the reads were sequenced. This field is compatible with the PL/platform field of BAM read + groups. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO. + */ + optional string platform = 5; + /* + The library from which the reads were sequenced. This field is compatible with the LB/library field of BAM read + groups. Free text. + */ + optional string library = 8; + /* + The platform unit on which the reads were sequenced. This field is for compatibility with samtools. + */ + optional string platform_unit = 12; + /* + The date the reads were sequenced. Useful to identify batch effects, in the format dd:MMM:yyyy. + The month is Jan, Feb, etc. to avoid all confusion with days when day<=12. + */ + optional string run_date = 6; +} + +/* + This message is written to 'basename'.tmh +*/ + +message AlignmentTooManyHits { + /* + The threshold used by the aligner to determine that a query is ambiguous and should be dropped. + Referred to as parameter k below. + */ + required uint32 aligner_threshold = 2; + /* + The hits that are assigned to several (>k) reference locations. + */ + repeated AmbiguousLocation hits = 1; + +} + +message AmbiguousLocation { + /* + The index of the query that matched too many times. + */ + required uint32 query_index = 1; + /* + The number of hits that triggered membership in the too many hits list. The query may hit more + locations than reported here, since some alignment tools will just drop queries that match above + a threshold and stop counting. This number can be >=k. + */ + required uint32 at_least_number_of_hits = 2; + /** +The length of the part of the query sequence that could be matched to the target (also called depth).
+May be less than the length of the query sequence, in which case the match was not perfect. When merging +alignments produced by searching different reference sequences, consider only at_least_number_of_hits +from alignments that have exactly the longer depth for the query. */ + optional uint32 length_of_match = 3; +} + +/* + This message is written to 'basename'.index + */ +message AlignmentIndex { + /* + Stores one element by target sequence. Each element is the cumulative target length for the target + stored at index i. Assume there are four target sequences, with lengths {10, 12, 15, 34}. The field + targetPositionOffsets will contain: {0,10,22,37}. Such offsets can be used to calculate the absolute + position of a genomic location. Given targetIndex and positionOnReference, the absolute location + is defined as targetPositionOffsets[targetIndex]+positionOnReference. + */ + repeated uint32 target_position_offsets = 1 [packed = true]; + /* + The byte offsets into the compressed entries file. Byte offsets are matched with absolute position + by index. There should be as many elements in offsets as there are in absolutePosition + where chunks start which represent entries whose absolute positions are less than + */ + repeated uint64 offsets = 2 [packed = true]; + /* + The absolute positions of the first entry in the chunk that immediately start at offset. One element + per chunk in the 'basename'.entries file. + */ + repeated uint64 absolute_positions = 3 [packed = true]; + +} diff --git a/sci-biology/goby-cpp/files/Reads.proto b/sci-biology/goby-cpp/files/Reads.proto new file mode 100644 index 000000000000..32c1244a3eb3 --- /dev/null +++ b/sci-biology/goby-cpp/files/Reads.proto @@ -0,0 +1,96 @@ +package goby; + +option java_package = "edu.cornell.med.icb.goby.reads"; +option optimize_for = SPEED; + +message ReadCollection { + repeated ReadEntry reads = 1; +} + +message ReadEntry { + /* + Index of a read. 
+ */ + required uint32 read_index = 1; + /* + Index of the barcode, if any. + */ + optional uint32 barcode_index = 10; + /* + Read identifier/name may be present. + */ + optional string read_identifier = 23; + /* + Additional description about the read (from Fasta/Q format). + */ + optional string description = 22; + /* + Length of the sequence. + */ + required uint32 read_length = 2; + /* + Sequence, encoded as ascii characters stored in single bytes. + */ + optional bytes sequence = 3; + /* + The second sequence in a pair. Stored the same way as the sequence attribute. + */ + optional bytes sequence_pair = 5; + /* + Length of the second sequence in a pair. + */ + optional uint32 read_length_pair = 6; + /* + Quality scores in Phred units, stored as single bytes (0-255). + */ + optional bytes quality_scores = 4; + /* + Quality scores for the second sequence in a pair. Stored as the 'qualityScores' attribute. + */ + optional bytes quality_scores_pair = 7; + /* + Compressed stream of data. The first byte indicates the compression/decompression method (codec). The remaining bytes are + content compressed with the codec. + */ + optional bytes compressed_data = 8; + /* + Stores meta-data about the reads. Typically meta-data is stored in the very first read of a + read collection, with the understanding that the meta-data applies to all the reads in the + collection. Meta-data can be used to store information about when the sample was sequenced, + or other information of interest. The key-value pair format is sufficiently flexible to + accommodate a variety of needs. The following keys are pre-defined. Please use pre-defined + keys so that automated tools can use metadata in a relatively standard way. Please note that + some keys provide a format for the value. This format should also be followed to guarantee + that meta data can be used computationally in a fully automatic manner.
+ + key="sequencing-run-start-date" value="MM/DD/YYYY" Used to record when the sequencing run + was initiated on the instrument. Can be used to detect batch effect in a large set of samples. + key="platform" value="". Value is free text, but the following terms are pre-defined. + Illumina GaIIx + Illumina HiSeq 1000 + Illumina HiSeq 2000 + Helicos Heliscope + LifeTech 5500 SOLiD + LifeTech 5500xl SOLiD + Roche 454 GS FLX Ti + + key="organism" value="species name" + Since Goby 1.9.1 + */ + repeated MetaData meta_data = 25; + +} +/* + A message to store a key/value pair and represent metadata about reads. + Since Goby 1.9.1 + */ +message MetaData { + /* + Provides the key. See examples in the documentation of meta_data for ReadEntry. + */ + required string key=1; + /* + Describes the value associated with the key. See examples in the documentation of meta_data for ReadEntry. + */ + required string value=2; +} diff --git a/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch b/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch new file mode 100644 index 000000000000..415785466af7 --- /dev/null +++ b/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch @@ -0,0 +1,16 @@ + src/Makefile.am | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/Makefile.am b/src/Makefile.am +index 1033382..33ca906 100644 +--- a/src/Makefile.am ++++ b/src/Makefile.am +@@ -84,7 +84,7 @@ GobyReadsStats_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_D + GobyReadsStats_SOURCES = \ + GobyReadsStats.cc + +-GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB} ++GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB} -lz + GobyFastaToCompact_SOURCES = \ + GobyFastaToCompact.cc + diff --git a/sci-biology/goby-cpp/goby-cpp-1.9.7.3.ebuild 
b/sci-biology/goby-cpp/goby-cpp-1.9.7.3.ebuild new file mode 100644 index 000000000000..1b15aaac3ac6 --- /dev/null +++ b/sci-biology/goby-cpp/goby-cpp-1.9.7.3.ebuild @@ -0,0 +1,26 @@ +# Copyright 1999-2011 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 +# $Id$ + +EAPI=4 + +inherit autotools + +DESCRIPTION="A DNA sequencing data management framework - C/C++ API" +HOMEPAGE="http://campagnelab.org/software/goby/" +SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip" + +LICENSE="GPL-3" +SLOT="0" +IUSE="" +KEYWORDS="~amd64 ~x86" + +DEPEND=">=dev-libs/protobuf-2.4.1 + >=dev-libs/libpcre-8.12" +RDEPEND="${DEPEND}" + +S="${WORKDIR}/goby_${PV}/cpp" + +src_prepare() { + eautoreconf +} diff --git a/sci-biology/goby-cpp/goby-cpp-1.9.8.1.ebuild b/sci-biology/goby-cpp/goby-cpp-1.9.8.1.ebuild new file mode 100644 index 000000000000..c74f57306255 --- /dev/null +++ b/sci-biology/goby-cpp/goby-cpp-1.9.8.1.ebuild @@ -0,0 +1,26 @@ +# Copyright 1999-2012 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 +# $Id$ + +EAPI=4 + +inherit autotools + +DESCRIPTION="A DNA sequencing data management framework - C/C++ API" +HOMEPAGE="http://campagnelab.org/software/goby/" +SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip" + +LICENSE="GPL-3" +SLOT="0" +IUSE="" +KEYWORDS="~amd64 ~x86" + +DEPEND=">=dev-libs/protobuf-2.4.1 + >=dev-libs/libpcre-8.12" +RDEPEND="${DEPEND}" + +S="${WORKDIR}/goby_${PV}/cpp" + +src_prepare() { + eautoreconf +} diff --git a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild new file mode 100644 index 000000000000..2cadda103999 --- /dev/null +++ b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild @@ -0,0 +1,42 @@ +# Copyright 1999-2015 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 +# $Id$ + +EAPI=5 + +AUTOTOOLS_AUTORECONF=yes + 
+inherit autotools-utils + +DESCRIPTION="A DNA sequencing data management framework - C/C++ API" +HOMEPAGE="http://campagnelab.org/software/goby/" +SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip" + +LICENSE="GPL-3" +SLOT="0" +KEYWORDS="~amd64 ~x86" +IUSE="static-libs" + +DEPEND=" + >=dev-libs/protobuf-2.4.1 + >=dev-libs/libpcre-8.12" +RDEPEND="${DEPEND}" + +S="${WORKDIR}/${PV}/cpp" + +PATCHES=( + "${FILESDIR}"/${P}-underlinking.patch +) + +src_prepare() { + sed \ + -e '/BUILD_TIMESTAMP/s:\(goby/timestamp.h\):$(top_srcdir)/src/\1:g' \ + -i src/Makefile.am || die + + pushd src/goby > /dev/null || die + cp "${FILESDIR}"/*.proto . || die + protoc --cpp_out=. *.proto || die + popd > /dev/null || die + + autotools-utils_src_prepare +} diff --git a/sci-biology/goby-cpp/metadata.xml b/sci-biology/goby-cpp/metadata.xml new file mode 100644 index 000000000000..f17a827e3101 --- /dev/null +++ b/sci-biology/goby-cpp/metadata.xml @@ -0,0 +1,5 @@ + + + + sci-biology + -- cgit v1.2.3-65-gdbad