diff --git a/.gitignore b/.gitignore index 9660ab63..4ab68438 100644 --- a/.gitignore +++ b/.gitignore @@ -276,3 +276,4 @@ dmypy.json # Pyre type checker .pyre/ +*.nodegraph diff --git a/README.md b/README.md index ddc95c67..ce4e794f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,8 @@ Kmer-hashing tools ================================ -[![image](https://img.shields.io/travis/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D.svg)](https://travis-ci.org/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D) - - -[![codecov](https://codecov.io/gh/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D/branch/master/graph/badge.svg)](https://codecov.io/gh/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D) - -[![image](https://img.shields.io/pypi/v/%7B%7B%20cookiecutter.repo_name%20%7D%7D.svg)](https://pypi.python.org/pypi/%7B%7B%20cookiecutter.repo_name%20%7D%7D) - +[![image](https://img.shields.io/travis/czbiohub/kh-tools.svg)](https://travis-ci.com/czbiohub/kh-tools) +[![codecov](https://codecov.io/gh/czbiohub/kh-tools/branch/master/graph/badge.svg)](https://codecov.io/gh/czbiohub/kh-tools) What is khtools? ------------------------------------- @@ -23,25 +18,91 @@ Installation To install this code, clone this github repository and use pip to install ``` -git clone czbiohub/khtools.git -cd khtools +git clone czbiohub/khtools.git +cd khtools # The "." means "install *this*, the folder where I am now" -pip install . +pip install . ``` Usage ----- -Greet a name multiple times! 
+### Extract likely protein-coding reads from sequencing data + + +``` +khtools extract-coding peptides.fa.gz *.fastq.gz > coding_peptides.fasta +``` + +#### Save the "coding scores" to a csv + +The "coding score" of each read is calculated by translating each read in six +frames, then calculating the +[Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) between any of the +six translated frames of the read and the peptide database. The final coding +score is the maximum Jaccard index across all reading frames. If you'd like to +see the coding scores for all reads, use the `--csv` flag. ``` -$ Kmer-hashing tools hello --name "Rosalind Franklin" --count 10 +khtools extract-coding --csv coding_scores.csv peptides.fa.gz *.fastq.gz > coding_peptides.fasta ``` -Features --------- +#### Save the coding nucleotides to a fasta + +By default, only the coding *peptides* are output. If you'd like to also output +the underlying *nucleotide* sequence, then use the flag `--coding-nucleotide-fasta` + +``` +khtools extract-coding --coding-nucleotide-fasta coding_nucleotides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta +``` -- TODO +#### Save the *non*-coding nucleotides to a fasta + +To see the sequence of reads which were deemed non-coding, use the flag +`--noncoding-nucleotide-fasta`. + +``` +khtools extract-coding --noncoding-nucleotide-fasta noncoding_nucleotides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta +``` + +#### Save the low complexity nucleotides to a fasta + +To see the sequence of reads found to have too low complexity of nucleotide +sequence to evaluate, use the flag `--low-complexity-nucleotide-fasta`. Low +complexity is determined by the same method as the read trimmer +[fastp](https://github.com/OpenGene/fastp) in which we calculate what +percentage of the bases differ from the base that follows them, +or mathematically, how often `seq[i] != seq[i+1]`. The default threshold is +`0.3`. 
As an example, the sequence `CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACACCC` +would be considered low complexity. While this sequence has many nucleotide +k-mers, it is likely a result of a sequencing error and we ignore it. + +``` +khtools extract-coding --low-complexity-nucleotide-fasta low_complexity_nucleotides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta +``` + +#### Save the low complexity peptides to a fasta + +Even if the nucleotide sequence may pass the complexity filter, the peptide +sequence may still be low complexity. As an example, all translated frames of +the sequence +`CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG` +would be considered low complexity, as it translates to either +`QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ` (5'3' Frame 1), +`SSSSSSSSSSSSSSSSSSSSSSSSSSSSS` (5'3' Frame 2), +`AAAAAAAAAAAAAAAAAAAAAAAAAAAAA` (5'3' Frame 3 and 3'5' Frame 3), +`LLLLLLLLLLLLLLLLLLLLLLLLLLLLLL` (3'5' Frame 1), +or `CCCCCCCCCCCCCCCCCCCCCCCCCCCCC` (3'5' Frame 2). As these sequences have few +k-mers and are difficult to assess for how "coding" they are, we ignore them. +Unlike for nucleotides where we look at runs of consecutive bases, we require +the translated peptide to contain greater than `(L - k + 1)/2` k-mers, where +`L` is the length of the sequence and `k` is the k-mer size. To save the +sequence of low-complexity peptides to a fasta, use the flag +`--low-complexity-peptide-fasta`. 
+ +``` +khtools extract_coding --low-complexity-peptides-fasta low_complexity_peptides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta +``` diff --git a/docs/usage.rst b/docs/usage.rst index 5b26480a..e1e20d40 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -5,3 +5,15 @@ Usage To use Kmer-hashing tools in a project:: import khtools + +To create a bloom filter of sequences:: + + khtools bloom-filter --molecule protein --peptide-ksize 7 --save-as Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph Homo_sapiens.GRCh38.pep.subset.fa.gz + +To partition reads into coding/noncoding bins using the bloom filter:: + + khtools partition -- SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled.fq.gz Homo_sapiens.GRCh38.pep.all.fa.gz + +To create the bloom filter and partition the reads in one step:: + + khtools partition ~/code/kmer-hashing/extract_kmers/test-data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled.fq.gz ~/Downloads/Homo_sapiens.GRCh38.pep.all.fa.gz diff --git a/khtools/bloom_filter.py b/khtools/bloom_filter.py new file mode 100644 index 00000000..7195f027 --- /dev/null +++ b/khtools/bloom_filter.py @@ -0,0 +1,253 @@ +import math +import os + +import click +import khmer +import screed +from sourmash._minhash import hash_murmur +from tqdm import tqdm + +from khtools.compare_kmer_content import kmerize +from khtools.sequence_encodings import encode_peptide, VALID_PEPTIDE_MOLECULES + +# khmer Nodegraph features +DEFAULT_N_TABLES = 4 +DEFAULT_MAX_TABLESIZE = int(1e8) + +# Default k-mer sizes for different alphabets +DEFAULT_PROTEIN_KSIZE = 7 +DEFAULT_DAYHOFF_KSIZE = 11 +DEFAULT_HP_KSIZE = 21 + + +def per_read_false_positive_coding_rate(n_kmers_in_read, n_total_kmers=1e7, + n_hash_functions=DEFAULT_N_TABLES, + tablesize=DEFAULT_MAX_TABLESIZE): + exponent = - n_hash_functions * n_total_kmers / tablesize + print(f"exponent: {exponent}") + + # Probability that a single k-mer is randomly in the data + # per_kmer_fpr = math.pow(1 - 
# Cribbed from https://click.palletsprojects.com/en/7.x/parameters/
class BasedIntParamType(click.ParamType):
    """Click parameter type for integers written plainly or as ``1e8``.

    Accepts an ``int`` (returned unchanged), a base-10 integer string, or a
    string in scientific notation such as ``"1e8"`` or ``"2.3E2"``. A
    scientific-notation value with a fractional remainder is truncated
    toward zero by ``int()``.
    """
    name = "integer"

    def convert(self, value, param, ctx):
        """Convert ``value`` to an ``int`` or abort through ``self.fail``.

        Parameters
        ----------
        value : int or str
            Raw value from the command line (or the option default)
        param, ctx
            Passed through to ``self.fail`` for error reporting

        Returns
        -------
        converted : int
        """
        if isinstance(value, int):
            return value
        if not isinstance(value, str):
            self.fail(
                "expected string for int() conversion, got "
                f"{value!r} of type {type(value).__name__}",
                param,
                ctx,
            )
        try:
            if 'e' in value.lower():
                # Let float() parse the whole literal: it is correctly
                # rounded, whereas multiplying the mantissa by a power of
                # ten accumulates error (2.3 * 10 ** 2 is
                # 229.99999999999997, which int() truncated to 229).
                return int(float(value))
            return int(value, 10)
        except (ValueError, OverflowError):
            # OverflowError: exponent too large for a float, e.g. "1e999"
            self.fail(f"{value!r} is not a valid integer", param, ctx)
def maybe_make_peptide_bloom_filter(peptides, peptide_ksize, molecule,
                                    peptides_are_bloom_filter,
                                    n_tables=DEFAULT_N_TABLES,
                                    tablesize=DEFAULT_MAX_TABLESIZE):
    """Load an existing peptide bloom filter, or build one from sequences

    Parameters
    ----------
    peptides : str
        Filename of either a saved bloom filter or a peptide sequence file
    peptide_ksize : int or None
        K-mer size. When loading, it is only checked against the filter's
        own ksize; when building, None selects the default for `molecule`
    molecule : str
        Amino acid encoding used when building the bloom filter
    peptides_are_bloom_filter : bool
        If True, `peptides` is loaded as a bloom filter instead of being
        read as sequences
    n_tables : int
        Number of tables, only used when building
    tablesize : int
        Size of each table, only used when building

    Returns
    -------
    peptide_bloom_filter
        The loaded or newly created bloom filter

    Raises
    ------
    ValueError
        If `peptide_ksize` is given and differs from the ksize stored in
        the loaded bloom filter
    """
    if peptides_are_bloom_filter:
        click.echo(
            f"Loading existing bloom filter from {peptides} and "
            f"making sure the ksizes match",
            err=True)
        peptide_bloom_filter = load_nodegraph(peptides)
        if peptide_ksize is not None:
            filter_ksize = peptide_bloom_filter.ksize()
            # Explicit comparison rather than `assert`, which is skipped
            # entirely when Python runs with -O
            if peptide_ksize != filter_ksize:
                raise ValueError(f"Given peptide ksize ({peptide_ksize}) and "
                                 f"ksize found in bloom filter "
                                 f"({filter_ksize}) are not "
                                 f"equal")
    else:
        peptide_ksize = get_peptide_ksize(molecule, peptide_ksize)
        click.echo(
            f"Creating peptide bloom filter with file: {peptides}\n"
            f"Using ksize: {peptide_ksize} and molecule: {molecule} "
            f"...",
            err=True)
        peptide_bloom_filter = make_peptide_bloom_filter(
            peptides, peptide_ksize, molecule=molecule,
            n_tables=n_tables, tablesize=tablesize)
    return peptide_bloom_filter
@click.command()
@click.argument('peptides')
@click.option('--peptide-ksize',
              default=None, type=int,
              help="K-mer size of the peptide sequence to use. Defaults for"
                   " different molecules are, "
                   f"protein: {DEFAULT_PROTEIN_KSIZE}"
                   f", dayhoff: {DEFAULT_DAYHOFF_KSIZE},"
                   f" hydrophobic-polar: {DEFAULT_HP_KSIZE}")
@click.option('--molecule',
              default='protein',
              help="The type of amino acid encoding to use. Default is "
                   "'protein', but 'dayhoff' or 'hydrophobic-polar' can be "
                   "used")
@click.option('--save-as',
              default=None,
              help='If provided, save peptide bloom filter as this filename. '
                   'Otherwise, add ksize and molecule name to input filename.')
@click.option('--tablesize', type=BASED_INT,
              default="1e8",
              help='Size of the bloom filter table to use')
@click.option('--n-tables', type=int,
              default=DEFAULT_N_TABLES,
              help='Number of bloom filter tables to use')
def cli(peptides, peptide_ksize=None, molecule='protein', save_as=None,
        tablesize=DEFAULT_MAX_TABLESIZE, n_tables=DEFAULT_N_TABLES):
    """Make a peptide bloom filter for your peptides

    \b
    Parameters
    ----------
    peptides : str
        Sequence file of peptides
    peptide_ksize : int
        Number of characters in amino acid words
    molecule : str
        Amino acid encoding: 'protein', 'dayhoff' or 'hydrophobic-polar'
    save_as : str
        Filename to save the bloom filter to. If not given, the molecule
        and ksize are added to the input filename
    tablesize : int
        Size of the bloom filter table
    n_tables : int
        Number of bloom filter tables

    \b
    Returns
    -------
    Nothing, the bloom filter is written to a file
    """
    # \b above prevents rewrapping of paragraph
    peptide_ksize = get_peptide_ksize(molecule, peptide_ksize)
    peptide_bloom_filter = make_peptide_bloom_filter(peptides, peptide_ksize,
                                                     molecule,
                                                     n_tables=n_tables,
                                                     tablesize=tablesize)
    click.echo("\tDone!", err=True)

    # This command always saves: True means "derive the filename from the
    # input", a string means "use exactly this filename"
    save_peptide_bloom_filter = save_as if save_as is not None else True
    maybe_save_peptide_bloom_filter(
        peptides,
        peptide_bloom_filter,
        molecule,
        save_peptide_bloom_filter=save_peptide_bloom_filter)
def validate_jaccard(ctx, param, value):
    """Ensure Jaccard threshold is between 0 and 1

    Click option callback.

    Parameters
    ----------
    ctx, param
        Supplied by click, unused
    value : None, str or number
        Raw option value. None (option not given) is passed through

    Returns
    -------
    jaccard : float or None
        The threshold as a float, or None if `value` was None

    Raises
    ------
    click.BadParameter
        If `value` is not a number, or is outside of [0, 1]
    """
    if value is None:
        return value
    try:
        jaccard = float(value)
    except (TypeError, ValueError):
        jaccard = None
    # Explicit range check rather than `assert`, which is skipped entirely
    # when Python runs with -O. NaN fails both comparisons and is rejected.
    if jaccard is None or not 0 <= jaccard <= 1:
        raise click.BadParameter(f'--jaccard-threshold needs to be a number'
                                 f' between 0 and 1, but {value} was provided')
    return jaccard
def score_single_translation(translation,
                             peptide_bloom_filter,
                             peptide_ksize,
                             molecule='protein',
                             verbose=True):
    """Compute the fraction of a translation's k-mers in the bloom filter

    Parameters
    ----------
    translation : str
        Peptide sequence of a single reading frame
    peptide_bloom_filter
        Bloom filter queried with ``.get(hash)``; a result greater than 0
        counts as the k-mer being present
    peptide_ksize : int
        K-mer size to cut the encoded translation into
    molecule : str
        Amino acid encoding passed to `encode_peptide`
    verbose : bool or int
        Values greater than 1 print the encoded translation, its k-mers and
        their bloom filter lookups to standard error

    Returns
    -------
    fraction_in_peptide_db : float
        Number of unique k-mers found in the bloom filter divided by the
        number of unique k-mers
    n_kmers : int
        Number of unique k-mers in the encoded translation
    """
    encoded = encode_peptide(translation, molecule)
    kmers = list(set(kmerize(str(encoded), peptide_ksize)))
    hashes = [hash_murmur(kmer) for kmer in kmers]
    n_kmers = len(kmers)
    n_kmers_in_peptide_db = sum(1 for h in hashes
                                if peptide_bloom_filter.get(h) > 0)
    if verbose > 1:
        click.echo(f"\ttranslation: \t{encoded}", err=True)
        # click.echo takes one message; its second positional argument is
        # the output file, so the k-mers must be part of the message
        click.echo("\tkmers: " + ' '.join(kmers), err=True)

        kmers_in_peptide_db = {(k, h): peptide_bloom_filter.get(h)
                               for k, h in zip(kmers, hashes)}
        click.echo("\tK-mers in peptide database:", err=True)
        click.echo(kmers_in_peptide_db, err=True)

    fraction_in_peptide_db = n_kmers_in_peptide_db / n_kmers

    return fraction_in_peptide_db, n_kmers
def compute_fastp_complexity(seq):
    """Compute the fraction of positions that differ from the next position

    Parameters
    ----------
    seq : str
        Sequence to compute complexity on

    Returns
    -------
    complexity : float
        Number of positions ``i`` where ``seq[i] != seq[i + 1]``, divided by
        the length of the sequence. An empty sequence has a complexity of
        0.0 instead of raising a ZeroDivisionError.
    """
    if len(seq) == 0:
        return 0.0
    n_different_consecutively = sum(1 for current, following
                                    in zip(seq, seq[1:])
                                    if current != following)
    # The denominator is the full sequence length although there are only
    # len(seq) - 1 adjacent pairs; kept so existing scores do not change
    return n_different_consecutively / len(seq)
def score_single_read(sequence,
                      peptide_bloom_filter,
                      peptide_ksize,
                      molecule='protein',
                      verbose=True,
                      jaccard_threshold=0.9,
                      description=None,
                      noncoding_file_handle=None,
                      coding_nucleotide_file_handle=None,
                      low_complexity_peptide_file_handle=None):
    """Predict whether a nucleotide sequence could be protein-coding

    Every stop-codon-free translation above the threshold is written to
    standard output as fasta.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence to predict on
    peptide_bloom_filter : khmer.Nodegraph
        Database of known peptide k-mers from a well-studied organism, e.g.
        human protein-coding sequences. Must have been built on peptides using
        the same k-mer size and molecular encoding as specified here, otherwise
        the results will make no sense
    peptide_ksize : int
        Length of the peptide words in sequence. Must match the k-mer size used
        for the peptide_bloom_filter otherwise nothing will match, or only
        false positives will match.
    molecule : str
        One of "protein"|"peptide", "dayhoff", or "hydrophobic-polar"|"hp" to
        encode the protein-coding space. Where "protein"|"peptide" is the
        original 20-letter amino acid encoding, Dayhoff ("dayhoff") is a lossy
        6-letter encoding that categorizes the amino acids into:
            1. Cysteine,
            2. Small (A, G, P, S, T)
            3. Acid and Amide (D, E, N, Q)
            4. Basic (H, K, R)
            5. Hydrophobic (I, L, M, V)
            6. Aromatic (F, W, Y)
        Hydrophobic-polar maps to a mere two categories:
            1. Hydrophobic (A, F, G, I, L, M, P, V, W, Y)
            2. Polar (C, D, E, H, K, N, Q, R, S, T)
    verbose : bool
        Whether or not to print a lot of stuff
    jaccard_threshold : float or None
        Value between 0 and 1. If None, the (empirically-chosen) "best"
        threshold is chosen for each molecule: 0.5 for "protein" and
        "dayhoff", and 0.8 for "hydrophobic-polar", since it is so lossy
        it's more likely to match random sequence. Note that the default of
        this function is 0.9, so None must be passed explicitly to get the
        per-molecule threshold.
    description : str
        The identifier in the sequence file, i.e. the name or descriptor of the
        sequence
    noncoding_file_handle : None or file
        If not None, write noncoding nucleotide reads to this file handle
    coding_nucleotide_file_handle : None or file
        If not None, write coding nucleotides reads to this file handle
    low_complexity_peptide_file_handle : None or file
        If not None, write low complexity peptide sequences to this file handle

    Returns
    -------
    max_fraction_in_peptide_db : float
        Of all reading frames, the maximum fraction of k-mers that matches the
        peptide database. NaN for the special cases below.
    max_n_kmers: int
        Number of k-mers observed in the translated, encoded peptide of the
        best-scoring reading frame
    special_case : str or None
        Additional message to write in the output csv describing the reason
        why this sequence could not be scored: no frame without stop codons,
        all frames too short, or a low complexity frame
    """
    # Convert to BioPython sequence object for translation
    seq = Seq(sequence)

    # In case this is used from the Python API and the threshold is None
    jaccard_threshold = get_jaccard_threshold(jaccard_threshold, molecule)

    translations = six_frame_translation_no_stops(seq)
    # For all translations, use the one with the maximum number of k-mers
    # in the database
    max_n_kmers = 0
    max_fraction_in_peptide_db = 0
    if len(translations) == 0:
        return np.nan, np.nan, "No translation frames without stop codons"

    # A translation must be longer than the k-mer size to be scored
    translations = {
        frame: translation
        for frame, translation in translations.items()
        if len(translation) > peptide_ksize
    }
    if len(translations) == 0:
        return np.nan, np.nan, "All translations shorter than peptide k-mer " \
                               "size + 1"

    for frame, translation in translations.items():
        # Convert back to string
        translation = str(translation)

        # Maybe reencode to dayhoff/hp space
        encoded = encode_peptide(translation, molecule)

        is_kmer_low_complexity, n_kmers = evaluate_is_kmer_low_complexity(
            encoded, peptide_ksize)

        if is_kmer_low_complexity:
            # A single low complexity frame ends scoring of the whole read.
            # Built with an f-string, like the coding seqname below, so a
            # description of None does not raise a TypeError.
            maybe_write_fasta(f"{description} translation_frame: {frame}",
                              low_complexity_peptide_file_handle, translation)
            return np.nan, n_kmers, f"Low complexity peptide in {molecule}" \
                                    " encoding"

        fraction_in_peptide_db, n_kmers = score_single_translation(
            encoded,
            peptide_bloom_filter,
            peptide_ksize,
            molecule=molecule,
            verbose=verbose)

        # Save the highest jaccard
        max_fraction_in_peptide_db = max(max_fraction_in_peptide_db,
                                         fraction_in_peptide_db)

        if max_fraction_in_peptide_db == fraction_in_peptide_db:
            # Update n_kmers if this is the best translation frame
            max_n_kmers = n_kmers
        if fraction_in_peptide_db > jaccard_threshold:
            if verbose:
                click.echo(f"\t{translation} is above {jaccard_threshold}",
                           err=True)
            seqname = f'{description} translation_frame: {frame} ' \
                      f'jaccard: {fraction_in_peptide_db}'
            write_fasta(sys.stdout, seqname, translation)
            maybe_write_fasta(seqname, coding_nucleotide_file_handle, sequence)

    if max_fraction_in_peptide_db <= jaccard_threshold:
        maybe_write_fasta(description, noncoding_file_handle, sequence)
    return max_fraction_in_peptide_db, max_n_kmers, None
def maybe_score_single_read(description, fastas, file_handles,
                            jaccard_threshold, molecule, nucleotide_ksize,
                            peptide_bloom_filter, peptide_ksize, sequence,
                            verbose):
    """Check if read is low complexity, otherwise score it

    Parameters
    ----------
    description : str
        Name of the read
    fastas : dict
        Sequence type -> fasta filename (or None). Unused, kept so existing
        callers keep working
    file_handles : dict
        Sequence type -> open file handle (or None) to write reads of that
        type to. Keys read here: 'low_complexity_nucleotide',
        'noncoding_nucleotide', 'coding_nucleotide' and
        'low_complexity_peptide'
    jaccard_threshold : float
        Threshold passed through to `score_single_read`
    molecule : str
        Amino acid encoding passed through to `score_single_read`
    nucleotide_ksize : int
        Unused, kept so existing callers keep working
    peptide_bloom_filter, peptide_ksize
        Passed through to `score_single_read`
    sequence : str
        Nucleotide sequence of the read
    verbose : bool or int
        Values greater than 1 print the resulting score to standard error

    Returns
    -------
    jaccard : float
        Coding score of the read, NaN if it was not scored
    n_kmers : int or float
        Number of peptide k-mers of the read, NaN if it was not scored
    special_case : str or None
        Reason the read was not scored, if any
    """
    # Check if nucleotide sequence is low complexity
    if evaluate_is_fastp_low_complexity(sequence):
        # Handled here, writing to the open file handle. Previously the read
        # was passed on with n_kmers = NaN, and because `nan > 0` is False
        # it was reported as too short and never written to the
        # low complexity fasta.
        maybe_write_fasta(description,
                          file_handles['low_complexity_nucleotide'],
                          sequence)
        jaccard = np.nan
        n_kmers = np.nan
        special_case = "Low complexity nucleotide"
    else:
        jaccard, n_kmers, special_case = score_single_read(
            sequence,
            peptide_bloom_filter,
            peptide_ksize,
            molecule,
            verbose,
            jaccard_threshold=jaccard_threshold,
            description=description,
            noncoding_file_handle=file_handles['noncoding_nucleotide'],
            coding_nucleotide_file_handle=file_handles['coding_nucleotide'],
            low_complexity_peptide_file_handle=file_handles[
                'low_complexity_peptide'])

        if verbose > 1:
            click.echo(f"Jaccard: {jaccard}, n_kmers = {n_kmers}", err=True)
    return jaccard, n_kmers, special_case
def maybe_write_json_summary(coding_scores, json_summary):
    """Write a json summary of the coding scores if a filename is given

    Parameters
    ----------
    coding_scores : pandas.DataFrame
        Per-read scores, with the columns 'classification' and
        'jaccard_in_peptide_db'
    json_summary : str or False
        Filename to write to. Nothing is done if this is falsy

    The json contains the `describe()` statistics of the jaccard scores,
    and the counts and percentages of each classification.
    """
    if not json_summary:
        return

    classification_value_counts = \
        coding_scores.classification.value_counts()
    classification_percentages = 100 * classification_value_counts / \
        classification_value_counts.sum()

    metadata = {
        'jaccard_info':
        coding_scores.jaccard_in_peptide_db.describe().to_dict(),
        'classification_value_counts':
        classification_value_counts.to_dict(),
        'classification_percentages':
        classification_percentages.to_dict()
    }
    with open(json_summary, 'w') as f:
        # Standard output carries the coding peptides fasta, so progress
        # messages must go to standard error
        click.echo(f"Writing extract_coding summary to {json_summary}",
                   err=True)
        json.dump(metadata, fp=f)
" + "Default filename is the name of the fasta file plus a " + "suffix denoting the protein encoding and peptide ksize") +@click.option('--peptides-are-bloom-filter', + is_flag=True, + default=False, + help="Peptide file is already a bloom filter") +@click.option('--jaccard-threshold', + default=None, type=click.FLOAT, callback=validate_jaccard, + help="Minimum fraction of peptide k-mers from read in the " + "peptide database for this read to be called a " + f"'coding read'. Default: {DEFAULT_JACCARD_THRESHOLD} for" + f" protein and dayhoff encodings, and " + f"{DEFAULT_HP_JACCARD_THRESHOLD} for hydrophobic-polar " + f"(hp) encoding") +@click.option('--molecule', + default='protein', + help="The type of amino acid encoding to use. Default is " + "'protein', but 'dayhoff' or 'hydrophobic-polar' can be " + "used") +@click.option('--csv', + default=False, + help='Name of csv file to write with all sequence reads and ' + 'their coding scores') +@click.option('--json-summary', + default=False, + help='Name of json file to write summarization of coding/' + 'noncoding/other categorizations, the ' + 'min/max/mean/median/stddev of Jaccard scores, and other') +@click.option("--coding-nucleotide-fasta", + help="If specified, save the coding nucleotides to this file") +@click.option("--noncoding-nucleotide-fasta", + help="If specified, save the noncoding nucleotides to this file") +@click.option("--low-complexity-nucleotide-fasta", + help="If specified, save the low-complexity nucleotides to this" + " file") +@click.option("--low-complexity-peptide-fasta", + help="If specified, save the low-complexity peptides to this " + "file") +@click.option('--tablesize', type=BASED_INT, + default="1e8", + help='Size of the bloom filter table to use') +@click.option('--n-tables', type=int, + default=DEFAULT_N_TABLES, + help='Size of the bloom filter table to use') +@click.option("--long-reads", + is_flag=True, + help="If set, then only considers reading frames starting with " + "start 
codon (ATG) and ending in a stop codon " + "(TAG, TAA, TGA)") +@click.option("--verbose", is_flag=True, help="Print more output") +def cli(peptides, + reads, + peptide_ksize=None, + save_peptide_bloom_filter=True, + peptides_are_bloom_filter=False, + jaccard_threshold=None, + molecule='protein', + csv=False, + json_summary=False, + coding_nucleotide_fasta=None, + noncoding_nucleotide_fasta=None, + low_complexity_nucleotide_fasta=None, + low_complexity_peptide_fasta=None, + tablesize=DEFAULT_MAX_TABLESIZE, n_tables=DEFAULT_N_TABLES, + long_reads=False, + verbose=False): + """Writes coding peptides from reads to standard output + + \b + Sane defaults for peptide_ksize for different peptide encodings: + - with "protein" or "peptide" --> --peptide-ksize = 5-10 + 7 is pretty universal but can go down to 5 for less species specificity + and up to 10 to be very specific + - with "dayhoff" --> --peptide-ksize = 10-15 + - with "hydrophobic-polar" or "hp" --> --peptide-ksize = 15-21 + 15 is pretty good but can do up to 21 + + \b + Parameters + ---------- + reads : str + Sequence file of reads to filter + peptides : str + Sequence file of peptides + peptide_ksize : int + Number of characters in amino acid words + save_peptide_bloom_filter : str or bool + Whether or not to save the created bloom filter to file. If a string, + save to this filename + peptides_are_bloom_filter : bool + Input file of peptides is already a bloom filter + jaccard_threshold : float + Value between 0 and 1. By default, the (empirically-chosen) "best" + threshold is chosen for each molecule. For "protein" and "dayhoff", + the default is 0.5, and for "hydrophobic-polar," it is 0.8, since it is + so lossy it's more likely to match random sequence. These thresholds + were determined empirically with a pre-chosen human RNA-seq dataset and + human peptides. + molecule : str + One of "protein"|"peptide", "dayhoff", or "hydrophobic-polar"|"hp" to + encode the protein-coding space. 
Where "protein"|"peptide" is the + original 20-letter amino acid encoding, Dayhoff ("dayhoff") is a lossy + 6-letter encoding that categorizes the amino acids into: + 1. Cysteine, + 2. Small (A, G, P, S, T) + 3. Acid and Amide (D, E, N, Q) + 4. Basic (H, K, R) + 5. Hydrophobic (I, L, M, V) + 6. Aromatic (F, W, Y) + Hydrophobic-polar maps to a mere two categories: + 1. Hydrophobic (A, F, G, I, L, M, P, V, W, Y) + 2. Polar (C, D, E, H, K, N, Q, R, S, T) + csv : str + Save the coding scores as a csv to this file + long_reads : bool -- NOT IMPLEMENTED!! + Input sequencing reads are long reads. Not implemented, but the plan + is, instead of doing 6-frame translation as on the short reads, test + all ATG (start codon) to stop codon reading frames for the one(s) that + matches the known peptide database best. Unknown whether this requires + new thresholds + coding_nucleotide_fasta : None or str + If specified, save coding nucleotide sequence to this file + noncoding_nucleotide_fasta : None or str + If specified, save noncoding nucleotide sequence to this file + low_complexity_nucleotide_fasta : None or str + If specified, save low complexity nucleotide sequence to this file + low_complexity_peptide_fasta : None or str + If specified, save low complexity peptide sequence to this file + verbose : bool + Whether or not to print lots of stuff. Can specify multiple, e.g. -vv + if you really like having everything in stdout + + \b + Returns + ------- + coding_peptides : str + Outputs a fasta-formatted sequence of translated peptides + """ + # \b above prevents re-wrapping of paragraphs + + if long_reads: + raise NotImplementedError("Not implemented! ... 
yet :)") + + peptide_bloom_filter = maybe_make_peptide_bloom_filter( + peptides, peptide_ksize, molecule, peptides_are_bloom_filter, + n_tables=n_tables, tablesize=tablesize) + click.echo("\tDone!", err=True) + + if not peptides_are_bloom_filter: + maybe_save_peptide_bloom_filter(peptides, peptide_bloom_filter, + molecule, save_peptide_bloom_filter) + + dfs = [] + for reads_file in reads: + df = score_reads( + reads_file, + peptide_bloom_filter, + jaccard_threshold=jaccard_threshold, + molecule=molecule, + verbose=verbose, + coding_nucleotide_fasta=coding_nucleotide_fasta, + noncoding_nucleotide_fasta=noncoding_nucleotide_fasta, + low_complexity_nucleotide_fasta=low_complexity_nucleotide_fasta, + low_complexity_peptide_fasta=low_complexity_peptide_fasta) + df['filename'] = reads_file + dfs.append(df) + + coding_scores = pd.concat(dfs, ignore_index=True) + + maybe_write_csv(coding_scores, csv) + maybe_write_json_summary(coding_scores, json_summary) + + +if __name__ == '__main__': + cli() diff --git a/khtools/hello.py b/khtools/hello.py deleted file mode 100644 index 7a949253..00000000 --- a/khtools/hello.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -This is an example subcommand for a git-like interface - -Cribbed from the Click documentation https://click.palletsprojects.com/en/7.x/ -""" -import random - -import click -from tqdm import tqdm - -# Import modified 'os' module with LC_LANG set so click doesn't complain. -# The '# noqa: F401' line prevents the linter from complaining about the unused -# import. 
-from .os_utils import os # noqa: F401 - - -COLORS = 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan' - - -@click.command() -@click.option('--count', default=5, help='Number of greetings.') -@click.option('--name', prompt='Your name', - help='The person to greet.') -def hello(count, name): - """Simple program that greets NAME for a total of COUNT times, in color.""" - for x in tqdm(range(count)): - # note that colorama.init() doesn't need to be called for the colors - # to work - click.echo(click.style('Hello %s!' % name, fg=random.choice(COLORS))) diff --git a/khtools/jupyter_utils.py b/khtools/jupyter_utils.py index cc07530a..c9889f58 100644 --- a/khtools/jupyter_utils.py +++ b/khtools/jupyter_utils.py @@ -4,7 +4,6 @@ import ipykernel import requests - from requests.compat import urljoin from notebook.notebookapp import list_running_servers diff --git a/khtools/os_utils.py b/khtools/os_utils.py index ab0ad26b..e9252ddc 100644 --- a/khtools/os_utils.py +++ b/khtools/os_utils.py @@ -1,7 +1,6 @@ import os import subprocess - # Set input language USA unicode encoding setting # Necessary because click assumes ascii input unless otherwise specified # https://click.palletsprojects.com/en/7.x/python3/ @@ -60,7 +59,8 @@ def get_stdout_stderr_from_command(command): lines : list Newline-separated strings from output of command """ - result = subprocess.run(command, stdout=subprocess.PIPE, + result = subprocess.run(command, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout = decode(result.stdout) stderr = decode(result.stderr) diff --git a/khtools/sequence_encodings.py b/khtools/sequence_encodings.py index eba15448..cf192869 100644 --- a/khtools/sequence_encodings.py +++ b/khtools/sequence_encodings.py @@ -1,5 +1,5 @@ DNA_ALPHABET = "A", "C", "G", "T" -AMINO_ACID_SINGLE_LETTERS = "R", "H", "K", "D", "E", "S", "T", "N", "Q", "C",\ +AMINO_ACID_SINGLE_LETTERS = "R", "H", "K", "D", "E", "S", "T", "N", "Q", "C", \ "G", "P", "A", "V", "I", "L", "M", "F", "Y", "W" 
DAYHOFF_MAPPING = { "C": "a", @@ -81,16 +81,16 @@ "Y": "h", # Hydrophilic - polar - "N": 'p', "C": 'p', - "S": "p", - "T": "p", - "D": "p", + "D": 'p', "E": "p", - "R": "p", "H": "p", "K": "p", - "Q": "p" + "N": "p", + "Q": "p", + "R": "p", + "S": "p", + "T": "p" } BOTVINNIK_MAPPING = { # Small and hydrophobic @@ -131,24 +131,10 @@ "H": "k", "P": "m" } -PURINE_PYRIMIDINE_MAPPING = { - "A": "R", - "C": "Y", - "G": "R", - "T": "Y" -} -AMINO_KETO_MAPPING = { - "A": "M", - "C": "M", - "G": "K", - "T": "K" -} -WEAK_STRONG_MAPPING = { - "A": "W", - "C": "S", - "G": "S", - "T": "W" -} + +PURINE_PYRIMIDINE_MAPPING = {"A": "R", "C": "Y", "G": "R", "T": "Y"} +AMINO_KETO_MAPPING = {"A": "M", "C": "M", "G": "K", "T": "K"} +WEAK_STRONG_MAPPING = {"A": "W", "C": "S", "G": "S", "T": "W"} AMINO_KETO_TRANSLATION = str.maketrans(AMINO_KETO_MAPPING) WEAK_STRONG_TRANSLATION = str.maketrans(WEAK_STRONG_MAPPING) PURINE_PYRIMIDINE_TRANSLATION = str.maketrans(PURINE_PYRIMIDINE_MAPPING) @@ -158,8 +144,11 @@ BOTVINNIK_TRANSLATION = str.maketrans(BOTVINNIK_MAPPING) +VALID_PEPTIDE_MOLECULES = 'protein', 'peptide', 'dayhoff', \ + 'hydrophobic-polar', 'hp' # Nucleic acid mappings + def amino_keto_ize(seq): return seq.translate(AMINO_KETO_TRANSLATION) @@ -187,3 +176,16 @@ def hpize(seq): def botvinnikize(seq): return seq.translate(BOTVINNIK_TRANSLATION) + + +def encode_peptide(peptide_sequence, molecule): + if molecule == 'dayhoff': + return dayhoffize(peptide_sequence) + elif molecule == 'hydrophobic-polar' or molecule == 'hp': + return hpize(peptide_sequence) + elif molecule in VALID_PEPTIDE_MOLECULES: + return peptide_sequence + else: + raise ValueError(f"{molecule} is not a valid amino acid encoding, " + "only " + "{', '.join(VALID_PEPTIDE_MOLECULES} can be used") diff --git a/khtools/tests/conftest.py b/khtools/tests/conftest.py deleted file mode 100644 index 21f77e68..00000000 --- a/khtools/tests/conftest.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - -import pytest - - -""" -conftest.py 
contains fixtures or functions-turned-variables that can be -used in any test -""" - - -@pytest.fixture -def data_folder(): - """Absolute path to where test data is stored""" - return os.path.join(os.path.abspath(os.path.dirname(__file__)), - './data') diff --git a/khtools/tests/test_hello.py b/khtools/tests/test_hello.py deleted file mode 100755 index 70b13577..00000000 --- a/khtools/tests/test_hello.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -test_hello ----------------------------------- - -Tests for `khtools` module. -""" - -from click.testing import CliRunner - - -def test_hello(): - from khtools.hello import hello - - runner = CliRunner() - result = runner.invoke(hello, input="Rosalind Franklin") - - assert result.exit_code == 0 - assert result.output.count("Hello Rosalind Franklin") == 5 - - -def test_hello_name(): - from khtools.hello import hello - - runner = CliRunner() - result = runner.invoke(hello, ["--name", "Rosalind"]) - - assert result.exit_code == 0 - assert result.output.count("Hello Rosalind") == 5 - - -def test_hello_count(): - from khtools.hello import hello - - runner = CliRunner() - result = runner.invoke(hello, ["--count", "10", - "--name", "Rosalind"]) - - assert result.exit_code == 0 - assert result.output.count("Hello Rosalind") == 10 diff --git a/setup.cfg b/setup.cfg index 5e409001..38734a18 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,5 @@ [wheel] universal = 1 + +[yapf] +based_on_style = pep8 diff --git a/khtools/tests/__init__.py b/tests/__init__.py similarity index 100% rename from khtools/tests/__init__.py rename to tests/__init__.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..471ffd2d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,94 @@ +import os + +import pytest +""" +conftest.py contains fixtures or functions-turned-variables that can be +used in any test +""" +from khtools.bloom_filter import DEFAULT_PROTEIN_KSIZE, \ + 
DEFAULT_DAYHOFF_KSIZE, DEFAULT_HP_KSIZE + + +@pytest.fixture +def data_folder(): + """Absolute path to where test data is stored""" + return os.path.join(os.path.abspath(os.path.dirname(__file__)), './data') + + +@pytest.fixture +def peptide_fasta(data_folder): + filename = os.path.join(data_folder, 'bloom_filter', + 'Homo_sapiens.GRCh38.pep.subset.fa.gz') + return filename + + +@pytest.fixture +def adversarial_peptide_fasta(data_folder): + filename = os.path.join(data_folder, 'bloom_filter', + 'Homo_sapiens.GRCh38.pep.first1000lines.fa') + return filename + + +@pytest.fixture(params=['normal', 'adversarial']) +def variable_peptide_fasta(request, peptide_fasta, adversarial_peptide_fasta): + if request.param == 'normal': + return peptide_fasta + else: + return adversarial_peptide_fasta + + +# Tie the molecule name to its default ksize to make sure we keep getting the +# right sequences +@pytest.fixture(params=[('protein', DEFAULT_PROTEIN_KSIZE), + ('dayhoff', DEFAULT_DAYHOFF_KSIZE), + pytest.param(('dayhoff', DEFAULT_PROTEIN_KSIZE), + marks=pytest.mark.xfail), + ('hydrophobic-polar', DEFAULT_HP_KSIZE), + pytest.param( + ('hydrophobic-polar', DEFAULT_PROTEIN_KSIZE), + marks=pytest.mark.xfail)], + ids=[ + 'protein_default_ksize', 'dayhoff_default_ksize', + 'dayhoff_protein_ksize_xfail', 'hp_default_ksize', + 'hp_protein_ksize_xfail' +]) +def molecule_ksize(request): + return request.param + + +@pytest.fixture +def peptide_ksize(molecule_ksize): + return molecule_ksize[1] + + +@pytest.fixture +def molecule(molecule_ksize): + return molecule_ksize[0] + + +@pytest.fixture +def peptide_bloom_filter_path(data_folder, molecule, peptide_ksize): + filename = os.path.join( + data_folder, 'bloom_filter', + f'Homo_sapiens.GRCh38.pep.subset.molecule-{molecule}_' + f'ksize-{peptide_ksize}.bloomfilter.nodegraph' + ) + return filename + + +@pytest.fixture +def peptide_bloom_filter(peptide_bloom_filter_path, peptide_fasta, molecule, + peptide_ksize): + from khtools.bloom_filter 
import load_nodegraph + """Load bloom filter from path if exists, otherwise, make it""" + try: + return load_nodegraph(peptide_bloom_filter_path) + except (FileNotFoundError, OSError): + from khtools.bloom_filter import make_peptide_bloom_filter + + bloom_filter = make_peptide_bloom_filter(peptide_fasta, + peptide_ksize, + molecule, + tablesize=1e6) + bloom_filter.save(peptide_bloom_filter_path) + return bloom_filter diff --git a/khtools/tests/data/.gitkeep b/tests/data/.gitkeep similarity index 100% rename from khtools/tests/data/.gitkeep rename to tests/data/.gitkeep diff --git a/khtools/tests/data/ENSP00000354687.pkl b/tests/data/ENSP00000354687.pkl similarity index 100% rename from khtools/tests/data/ENSP00000354687.pkl rename to tests/data/ENSP00000354687.pkl diff --git a/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_first20.fq.gz b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_first20.fq.gz new file mode 100644 index 00000000..5cd527fb Binary files /dev/null and b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_first20.fq.gz differ diff --git a/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq new file mode 100644 index 00000000..9e8ced6e --- /dev/null +++ b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq @@ -0,0 +1,92 @@ +@SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1 +CGCTTGCTTAATACTGACATCAATAATATTAGGAAAATCGCAATATAACTGTAAATCCTGTTCTGTC ++ +322/2415652337555776752654675357764447564646644378654364939545:;538 +@SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1 +TCTAGAATGTGAAATAACGTACTTCATGTGTCTTCTTACCAAAAATACCAACGATAAGGGGAAAAGCCATC ++ +-0226727656145464554477797863768459454565555453855566855376368750886647 +@SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1 +CAATCATCATCACTTTCTAATTCCAGAATATTTTCATCACCCCAAAAAGAAATCCTAAATCCATTAGC ++ 
+2//004684572653325355467595624554598657657663644:6433;675575:6936684 +@SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1 +CAAAAGTGAAGACCTCCCTGGGGTCTTCAAAGACAGCCTTTGCTCTCCATGTAGCCAATGGTGCTCT ++ +6022356557706648784564628446554755486554756596986885587756554286585 +@SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1 +GTAACCCTTAATTCCTCAGAAAAAATAGACAACATAGTGGAGTGGGATGGAGGAAC ++ +30332625662567434687545364375744766473573546655765668745 +@SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1 +CTGTGATTATTTTTTTTCCTAGCATTTGTTCTTTCAGAAAAGGACTGAACTCTAAATTCTGGACTTGAAGACTG ++ +5350:57437356354566558756576348866:7;6643836778525:85667688<86667477475556 +@SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1 +GTTTGATTCTTGACAATTTCTTCTGGAACAAGTCTTTCATATACATTAGACTGGTATCATTGAGTTCTGAGC ++ +21115386673726345758678888346647476567794718586294896426:777786746;88557 +@SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1 +CACAAGGCTAACATACACAATCTGTAACACGAGATGGATAGCACACACATATGACACAATTTC ++ +61452257562856465866687736155375878767359637353643596776677544; +@SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1 +CTGTAGTGTGGTCCATTCCCAGACAGCAACATGCAAGAATAAGTTTACAATACACTCAGCCCTTCTG ++ +324/369683845456484777454966366774539647446284364472965687967666555 +@SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1 +TAGGAAAATAGTAATATTTGCAACTTATGAATGATAAGTCAGAAAAGTTACATGGAATGTTAAATTTT ++ +,/47363256812646557466668738876:78754746593546:554625679796776775856 +@SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1 +CTGATAGACTGAAACTGGTTTTGTTATTCTTAACGTTCTCCAGTCTGCACTCTGCTGTGCTGTCTGTGCTC ++ +33322463673034336624677544645544414254565543869624775658457667567637565 +@SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1 +TTAAGTTCTAGTCTGTGAGCACTTGTAGTTCAATAATCGTCATCTTCATCAGAGTCCATTACTTTTCTTCTGTTG ++ +-4215189658489557777668:7529368673668567987;:6;55877774;795647=768=87::4265 +@SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1 
+AAATAAGGTAACATTTAACAATAATCTGATACACATAAATAGAGAAAGAGCAATTGATAAAGTAAATG ++ ++/123653252555435435755265765346547543473736454675656677;5566:343676 +@SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1 +CTTGGATCCACCATACTCAAGAGTTATTACACAAAGGGAAACAGAAAATAACCAAATGACATCAGAA ++ +42065467567555352764747445673547754756442445353345577667875987:6676 +@SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1 +CCAACACATCCAATTTGTATTTTCTTAAATATGTGTTTCTTAGGTATCTAAGGATACATGAGCGAGCCC ++ +3/221611467247456247465754444476666257966444376:775879558866696749866 +@SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1 +ATCTCTTGTAATAATTCAACATATTCCCTGGCTATTAACTAATTTCCAAGCCTGAACTGTCA ++ ++2211175462223332546843655755866666343355754885535757574685477 +@SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1 +TGAGGCAAACAGCAAGAGTAAGCAGTGTTACTTGCAGGTACTTTGGTTAATGTTGATTTAAATTTTCATG ++ ++31546132745663636164653655113459674751345574646339858877:63747864:796 +@SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1 +CAATGCCGTGCCAGTGGAGACTGTTCTCGTATGCC ++ +7233774594<778467675:87567366425937 +@SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1 +CCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAAC ++ +3/032525448575344564775543445846735376486767:5786677655;5745767657556737 +@SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1 +CAGGAATGAGGCCCCGACTAAAATTCGCTGCAAAAGCCCAAAATCTAGTTAGCATAAATTCCTCAGACATG ++ +4/3525833147443336366546545547666638656745567547545878665656;8683:86795 +@SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1 +ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ++ +(04147:;:9<<:7;88<>=@>>8<;;<=;C;>;:5:;9<<::6@;E;?:C@=:9:67 +@SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1 +CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACACCC ++ +2263688B;9<<9;=;9<=><:;=:@<@<<;@;S5:;;MENSP00000488240.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142847306:142847317:1 gene:ENSG00000282253.1 transcript:ENST00000631435.1 
gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158] +GTGG +>ENSP00000451042.1 pep chromosome:GRCh38:14:22438547:22438554:1 gene:ENSG00000223997.1 transcript:ENST00000415118.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD1 description:T cell receptor delta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12254] +EI +>ENSP00000452494.1 pep chromosome:GRCh38:14:22449113:22449125:1 gene:ENSG00000228985.1 transcript:ENST00000448914.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD3 description:T cell receptor delta diversity 3 [Source:HGNC Symbol;Acc:HGNC:12256] +TGGY +>ENSP00000451515.1 pep chromosome:GRCh38:14:22439007:22439015:1 gene:ENSG00000237235.2 transcript:ENST00000434970.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD2 description:T cell receptor delta diversity 2 [Source:HGNC Symbol;Acc:HGNC:12255] +PSY +>ENSP00000487941.1 pep chromosome:GRCh38:7:142786213:142786224:1 gene:ENSG00000282431.1 transcript:ENST00000632684.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158] +GTGG +>ENSP00000418639.1 pep chromosome:GRCh38:14:105865551:105865561:-1 gene:ENSG00000236597.1 transcript:ENST00000439842.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD7-27 description:immunoglobulin heavy diversity 7-27 [Source:HGNC Symbol;Acc:HGNC:5518] +LTG +>ENSP00000420733.1 pep chromosome:GRCh38:14:105881034:105881053:-1 gene:ENSG00000211907.1 transcript:ENST00000390567.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-26 description:immunoglobulin heavy diversity 1-26 [Source:HGNC Symbol;Acc:HGNC:5485] +GIVGAT +>ENSP00000417751.1 pep chromosome:GRCh38:14:105881539:105881556:-1 gene:ENSG00000225825.1 transcript:ENST00000452198.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene 
gene_symbol:IGHD6-25 description:immunoglobulin heavy diversity 6-25 [Source:HGNC Symbol;Acc:HGNC:5516] +GYSSGY +>ENSP00000419139.1 pep chromosome:GRCh38:14:105883903:105883922:-1 gene:ENSG00000211909.1 transcript:ENST00000390569.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-24 description:immunoglobulin heavy diversity 5-24 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5510] +VEMATI +>ENSP00000430248.1 pep chromosome:GRCh38:14:105884870:105884888:-1 gene:ENSG00000227196.1 transcript:ENST00000437320.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-23 description:immunoglobulin heavy diversity 4-23 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5504] +*LRW*L +>ENSP00000429952.1 pep chromosome:GRCh38:14:105886031:105886061:-1 gene:ENSG00000211911.1 transcript:ENST00000390571.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-22 description:immunoglobulin heavy diversity 3-22 [Source:HGNC Symbol;Acc:HGNC:5497] +VLL***WLLL +>ENSP00000429324.1 pep chromosome:GRCh38:14:105888551:105888578:-1 gene:ENSG00000211912.1 transcript:ENST00000390572.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-21 description:immunoglobulin heavy diversity 2-21 [Source:HGNC Symbol;Acc:HGNC:5491] +SILWW*LLF +>ENSP00000418010.1 pep chromosome:GRCh38:14:105891699:105891719:-1 gene:ENSG00000211914.1 transcript:ENST00000390574.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-19 description:immunoglobulin heavy diversity 6-19 [Source:HGNC Symbol;Acc:HGNC:5515] +GYSSGWY +>ENSP00000417555.1 pep chromosome:GRCh38:14:105893542:105893561:-1 gene:ENSG00000211915.1 transcript:ENST00000390575.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-18 description:immunoglobulin heavy diversity 5-18 [Source:HGNC Symbol;Acc:HGNC:5509] +VDTAMV +>ENSP00000428366.1 pep chromosome:GRCh38:14:105895634:105895670:-1 gene:ENSG00000211917.1 transcript:ENST00000390577.1 
gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-16 description:immunoglobulin heavy diversity 3-16 [Source:HGNC Symbol;Acc:HGNC:5496] +VL*LRLGELSLY +>ENSP00000431089.1 pep chromosome:GRCh38:14:105894508:105894523:-1 gene:ENSG00000227800.1 transcript:ENST00000431870.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-17 description:immunoglobulin heavy diversity 4-17 [Source:HGNC Symbol;Acc:HGNC:5503] +*LR*L +>ENSP00000420556.1 pep chromosome:GRCh38:14:105891191:105891207:-1 gene:ENSG00000237020.1 transcript:ENST00000450276.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-20 description:immunoglobulin heavy diversity 1-20 [Source:HGNC Symbol;Acc:HGNC:5484] +GITGT +>ENSP00000427969.1 pep chromosome:GRCh38:14:105897957:105897987:-1 gene:ENSG00000211918.1 transcript:ENST00000390578.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-15 description:immunoglobulin heavy diversity 2-15 [Source:HGNC Symbol;Acc:HGNC:5489] +RIL*WW*LLL +>ENSP00000418765.1 pep chromosome:GRCh38:14:105900638:105900654:-1 gene:ENSG00000227108.1 transcript:ENST00000451044.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-14 description:immunoglobulin heavy diversity 1-14 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5483] +GITGT +>ENSP00000419564.1 pep chromosome:GRCh38:14:105901142:105901162:-1 gene:ENSG00000211920.1 transcript:ENST00000390580.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-13 description:immunoglobulin heavy diversity 6-13 [Source:HGNC Symbol;Acc:HGNC:5514] +GYSSSWY +>ENSP00000419283.1 pep chromosome:GRCh38:14:105902649:105902671:-1 gene:ENSG00000211921.1 transcript:ENST00000390581.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-12 description:immunoglobulin heavy diversity 5-12 [Source:HGNC Symbol;Acc:HGNC:5508] +VDIVATI +>ENSP00000430034.1 pep chromosome:GRCh38:14:105903616:105903631:-1 gene:ENSG00000232543.2 
transcript:ENST00000431440.2 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-11 description:immunoglobulin heavy diversity 4-11 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5502] +*LQ*L +>ENSP00000419773.1 pep chromosome:GRCh38:14:105904497:105904527:-1 gene:ENSG00000211923.1 transcript:ENST00000390583.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-10 description:immunoglobulin heavy diversity 3-10 [Source:HGNC Symbol;Acc:HGNC:5495] +VLLWFGELL +>ENSP00000488840.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105920273:105920289:-1 gene:ENSG00000282714.1 transcript:ENST00000633210.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-1 description:immunoglobulin heavy diversity 1-1 [Source:HGNC Symbol;Acc:HGNC:5482] +GTTGT +>ENSP00000475053.2 pep chromosome:GRCh38:15:21011451:21011469:-1 gene:ENSG00000270451.1 transcript:ENST00000603693.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4OR15-4B description:immunoglobulin heavy diversity 4/OR15-4B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5507] +*LWC*L +>ENSP00000474222.1 pep chromosome:GRCh38:15:21017800:21017816:-1 gene:ENSG00000270185.1 transcript:ENST00000604838.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1OR15-1B description:immunoglobulin heavy diversity 1/OR15-1B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5488] +GITGT +>ENSP00000473700.1 pep chromosome:GRCh38:15:21010494:21010516:-1 gene:ENSG00000270824.1 transcript:ENST00000604446.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5OR15-5B description:immunoglobulin heavy diversity 5/OR15-5B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5513] +VDIVSTI +>ENSP00000474017.2 pep chromosome:GRCh38:15:21015048:21015078:-1 gene:ENSG00000282268.1 transcript:ENST00000604102.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2OR15-2B description:immunoglobulin heavy diversity 2/OR15-2B (non-functional) [Source:HGNC 
Symbol;Acc:HGNC:5494] +RIL**YYFLC +>ENSP00000474573.2 pep chromosome:GRCh38:15:21012559:21012589:-1 gene:ENSG00000282089.1 transcript:ENST00000603935.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3OR15-3B description:immunoglobulin heavy diversity 3/OR15-3B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5501] +VL*FLDWLLY +>ENSP00000488695.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105866322:105866332:-1 gene:ENSG00000282455.1 transcript:ENST00000632524.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD7-27 description:immunoglobulin heavy diversity 7-27 [Source:HGNC Symbol;Acc:HGNC:5518] +LTG +>ENSP00000488000.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105881805:105881824:-1 gene:ENSG00000282323.1 transcript:ENST00000633009.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-26 description:immunoglobulin heavy diversity 1-26 [Source:HGNC Symbol;Acc:HGNC:5485] +GIVGAT +>ENSP00000488392.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105882310:105882327:-1 gene:ENSG00000282724.1 transcript:ENST00000634070.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-25 description:immunoglobulin heavy diversity 6-25 [Source:HGNC Symbol;Acc:HGNC:5516] +GYSSGY +>ENSP00000488113.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105884674:105884693:-1 gene:ENSG00000282674.1 transcript:ENST00000632963.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-24 description:immunoglobulin heavy diversity 5-24 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5510] +VEMATI +>ENSP00000488168.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105885641:105885659:-1 gene:ENSG00000282640.1 transcript:ENST00000633030.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-23 description:immunoglobulin heavy diversity 4-23 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5504] +*LRW*L +>ENSP00000488711.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105886802:105886832:-1 gene:ENSG00000282396.1 
transcript:ENST00000633765.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-22 description:immunoglobulin heavy diversity 3-22 [Source:HGNC Symbol;Acc:HGNC:5497] +VLL***WLLL +>ENSP00000487599.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105889322:105889349:-1 gene:ENSG00000281984.1 transcript:ENST00000632619.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-21 description:immunoglobulin heavy diversity 2-21 [Source:HGNC Symbol;Acc:HGNC:5491] +SILWW*LLF +>ENSP00000488201.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105894313:105894332:-1 gene:ENSG00000282346.1 transcript:ENST00000631871.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-18 description:immunoglobulin heavy diversity 5-18 [Source:HGNC Symbol;Acc:HGNC:5509] +VDTAMV +>ENSP00000487787.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105896405:105896441:-1 gene:ENSG00000282232.1 transcript:ENST00000633379.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-16 description:immunoglobulin heavy diversity 3-16 [Source:HGNC Symbol;Acc:HGNC:5496] +VL*LRLGELSLY +>ENSP00000488261.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105895279:105895294:-1 gene:ENSG00000282274.1 transcript:ENST00000633010.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-17 description:immunoglobulin heavy diversity 4-17 [Source:HGNC Symbol;Acc:HGNC:5503] +*LR*L +>ENSP00000487789.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105892470:105892490:-1 gene:ENSG00000282487.1 transcript:ENST00000633159.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-19 description:immunoglobulin heavy diversity 6-19 [Source:HGNC Symbol;Acc:HGNC:5515] +GYSSGWY +>ENSP00000487812.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105891962:105891978:-1 gene:ENSG00000282592.1 transcript:ENST00000632968.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-20 description:immunoglobulin heavy diversity 1-20 [Source:HGNC Symbol;Acc:HGNC:5484] 
+GITGT +>ENSP00000487993.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105898728:105898758:-1 gene:ENSG00000282818.1 transcript:ENST00000632473.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-15 description:immunoglobulin heavy diversity 2-15 [Source:HGNC Symbol;Acc:HGNC:5489] +RIL*WW*LLL +>ENSP00000488522.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105901409:105901425:-1 gene:ENSG00000282736.1 transcript:ENST00000631884.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-14 description:immunoglobulin heavy diversity 1-14 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5483] +GITGT +>ENSP00000488592.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105901913:105901933:-1 gene:ENSG00000282042.1 transcript:ENST00000632859.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-13 description:immunoglobulin heavy diversity 6-13 [Source:HGNC Symbol;Acc:HGNC:5514] +GYSSSWY +>ENSP00000487922.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105903420:105903442:-1 gene:ENSG00000282102.1 transcript:ENST00000631895.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-12 description:immunoglobulin heavy diversity 5-12 [Source:HGNC Symbol;Acc:HGNC:5508] +VDIVATI +>ENSP00000488735.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105904387:105904402:-1 gene:ENSG00000281940.1 transcript:ENST00000634154.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-11 description:immunoglobulin heavy diversity 4-11 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5502] +*LQ*L +>ENSP00000488475.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105905268:105905298:-1 gene:ENSG00000282373.1 transcript:ENST00000632609.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-10 description:immunoglobulin heavy diversity 3-10 [Source:HGNC Symbol;Acc:HGNC:5495] +VLLWFGELL +>ENSP00000487775.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105905452:105905482:-1 gene:ENSG00000281939.1 transcript:ENST00000632911.1 
gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-9 description:immunoglobulin heavy diversity 3-9 [Source:HGNC Symbol;Acc:HGNC:5499] +VLRYFDWLL +>ENSP00000488083.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105907982:105908012:-1 gene:ENSG00000282132.1 transcript:ENST00000633504.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-8 description:immunoglobulin heavy diversity 2-8 [Source:HGNC Symbol;Acc:HGNC:5492] +RILY*WCMLY +>ENSP00000488720.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105910678:105910694:-1 gene:ENSG00000282495.1 transcript:ENST00000632304.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-7 description:immunoglobulin heavy diversity 1-7 [Source:HGNC Symbol;Acc:HGNC:5486] +GITGT +>ENSP00000488589.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105911181:105911198:-1 gene:ENSG00000282010.1 transcript:ENST00000632542.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-6 description:immunoglobulin heavy diversity 6-6 [Source:HGNC Symbol;Acc:HGNC:5517] +EYSSSS +>ENSP00000487937.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105913028:105913047:-1 gene:ENSG00000282769.1 transcript:ENST00000633968.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-5 description:immunoglobulin heavy diversity 5-5 [Source:HGNC Symbol;Acc:HGNC:5511] +VDTAMV +>ENSP00000488889.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105913993:105914008:-1 gene:ENSG00000282227.1 transcript:ENST00000634085.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-4 description:immunoglobulin heavy diversity 4-4 [Source:HGNC Symbol;Acc:HGNC:5505] +*LQ*L +>ENSP00000487903.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105915130:105915160:-1 gene:ENSG00000282754.1 transcript:ENST00000633353.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-3 description:immunoglobulin heavy diversity 3-3 [Source:HGNC Symbol;Acc:HGNC:5498] +VLRFLEWLLY +>ENSP00000487604.1 pep 
chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105917597:105917627:-1 gene:ENSG00000282578.1 transcript:ENST00000631803.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-2 description:immunoglobulin heavy diversity 2-2 [Source:HGNC Symbol;Acc:HGNC:5490] +RIL**YQLLC +>ENSP00000419583.1 pep chromosome:GRCh38:14:105904681:105904711:-1 gene:ENSG00000211924.1 transcript:ENST00000390584.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-9 description:immunoglobulin heavy diversity 3-9 [Source:HGNC Symbol;Acc:HGNC:5499] +VLRYFDWLL +>ENSP00000428616.1 pep chromosome:GRCh38:14:105907211:105907241:-1 gene:ENSG00000211925.1 transcript:ENST00000390585.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-8 description:immunoglobulin heavy diversity 2-8 [Source:HGNC Symbol;Acc:HGNC:5492] +RILY*WCMLY +>ENSP00000420794.1 pep chromosome:GRCh38:14:105909907:105909923:-1 gene:ENSG00000237197.1 transcript:ENST00000430425.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-7 description:immunoglobulin heavy diversity 1-7 [Source:HGNC Symbol;Acc:HGNC:5486] +GITGT +>ENSP00000418151.1 pep chromosome:GRCh38:14:105910410:105910427:-1 gene:ENSG00000228131.1 transcript:ENST00000454691.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-6 description:immunoglobulin heavy diversity 6-6 [Source:HGNC Symbol;Acc:HGNC:5517] +EYSSSS +>ENSP00000417892.1 pep chromosome:GRCh38:14:105912257:105912276:-1 gene:ENSG00000211928.1 transcript:ENST00000390588.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-5 description:immunoglobulin heavy diversity 5-5 [Source:HGNC Symbol;Acc:HGNC:5511] +VDTAMV +>ENSP00000428393.1 pep chromosome:GRCh38:14:105913222:105913237:-1 gene:ENSG00000233655.1 transcript:ENST00000414852.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-4 description:immunoglobulin heavy diversity 4-4 [Source:HGNC Symbol;Acc:HGNC:5505] +*LQ*L +>ENSP00000420442.1 pep 
chromosome:GRCh38:14:105914359:105914389:-1 gene:ENSG00000211930.1 transcript:ENST00000390590.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-3 description:immunoglobulin heavy diversity 3-3 [Source:HGNC Symbol;Acc:HGNC:5498] +VLRFLEWLLY +>ENSP00000430788.1 pep chromosome:GRCh38:14:105916826:105916856:-1 gene:ENSG00000211931.1 transcript:ENST00000390591.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-2 description:immunoglobulin heavy diversity 2-2 [Source:HGNC Symbol;Acc:HGNC:5490] +RIL**YQLLC +>ENSP00000418625.1 pep chromosome:GRCh38:14:105919502:105919518:-1 gene:ENSG00000236170.1 transcript:ENST00000454908.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-1 description:immunoglobulin heavy diversity 1-1 [Source:HGNC Symbol;Acc:HGNC:5482] +GTTGT +>ENSP00000473849.1 pep chromosome:GRCh38:15:20003840:20003862:-1 gene:ENSG00000270961.1 transcript:ENST00000604642.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5OR15-5A description:immunoglobulin heavy diversity 5/OR15-5A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5512] +VDIVSTI +>ENSP00000474693.2 pep chromosome:GRCh38:15:20004797:20004815:-1 gene:ENSG00000271317.1 transcript:ENST00000603326.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4OR15-4A description:immunoglobulin heavy diversity 4/OR15-4A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5506] +*LWC*L +>ENSP00000474133.2 pep chromosome:GRCh38:15:20005905:20005935:-1 gene:ENSG00000282520.1 transcript:ENST00000604950.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3OR15-3A description:immunoglobulin heavy diversity 3/OR15-3A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5500] +VL*FLDWLLY +>ENSP00000474065.2 pep chromosome:GRCh38:15:20008402:20008432:-1 gene:ENSG00000282599.1 transcript:ENST00000603077.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2OR15-2A description:immunoglobulin heavy diversity 
2/OR15-2A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5493] +RIL**YYFLC +>ENSP00000473787.1 pep chromosome:GRCh38:15:20011153:20011169:-1 gene:ENSG00000271336.1 transcript:ENST00000605284.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1OR15-1A description:immunoglobulin heavy diversity 1/OR15-1A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5487] +GITGT +>ENSP00000487939.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142300947:142301455:1 gene:ENSG00000282568.1 transcript:ENST00000632828.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV2 description:T cell receptor beta variable 2 [Source:HGNC Symbol;Acc:HGNC:12195] +MDTWLVCWAIFSLLKAGLTEPEVTQTPSHQVTQMGQEVILHCVPISNHLYFYWYRQILGQ +KVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE +>ENSP00000488814.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142308565:142309071:1 gene:ENSG00000282624.1 transcript:ENST00000632422.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV3-1 description:T cell receptor beta variable 3-1 [Source:HGNC Symbol;Acc:HGNC:12212] +MGCRLLCCVVFCLLQAGPLDTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKK +FLKIMFSYNNKELIINETVPNRFSPKSPDKAHLNLHINSLELGDSAVYFCASSQ +>ENSP00000488131.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142313207:142313689:1 gene:ENSG00000282014.1 transcript:ENST00000632713.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-1 description:T cell receptor beta variable 4-1 [Source:HGNC Symbol;Acc:HGNC:12215] +MGCRLLCCAVLCLLGAVPIDTEVTQTPKHLVMGMTNKKSLKCEQHMGHRAMYWYKQKAKK +PPELMFVYSYEKLSINESVRSRFSPECPNSSLLNLHLHALQPEDSALYLCASSQ +>ENSP00000488308.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142320696:142321563:1 gene:ENSG00000282803.1 transcript:ENST00000633384.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-1 description:T cell receptor beta variable 5-1 [Source:HGNC Symbol;Acc:HGNC:12218] +MGSRLLCWVLLCLLGAGPVKAGVTQTPRYLIKTRGQQVTLSCSPISGHRSVSWYQQTPGQ 
+GLQFLFEYFSETQRNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYLCASSL +>ENSP00000488756.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142328321:142328810:1 gene:ENSG00000281970.1 transcript:ENST00000631557.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-1 description:T cell receptor beta variable 6-1 [Source:HGNC Symbol;Acc:HGNC:12226] +MSIGLLCCVAFSLLWASPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHNSMYWYRQDPGM +GLRLIYYSASEGTTDKGEVPNGYNVSRLNKREFSLRLESAAPSQTSVYFCASSE +>ENSP00000488287.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142332206:142332727:1 gene:ENSG00000282225.1 transcript:ENST00000632308.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-1 description:T cell receptor beta variable 7-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12235] +MGTRLLCWAAICLLGADHTGAGVSQSLRHKVAKKGKDVALRYDPISGHNALYWYRQSLGQ +GLEFPIYFQGKDAADKSGLPRDRFSAQRSEGSISTLKFQRTQQGDLAVYLCASSS +>ENSP00000487667.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142345452:142346016:1 gene:ENSG00000282285.1 transcript:ENST00000632512.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-2 description:T cell receptor beta variable 4-2 [Source:HGNC Symbol;Acc:HGNC:12216] +MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKK +PLELMFVYNFKEQTENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ +>ENSP00000488603.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142349215:142349695:1 gene:ENSG00000282719.1 transcript:ENST00000632016.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-2 description:T cell receptor beta variable 6-2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12227] +MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGM +GLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY +>ENSP00000488576.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142362570:142363134:1 gene:ENSG00000282543.1 transcript:ENST00000631427.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC234635.3 description:T cell receptor beta variable 4-3 
[Source:UniProtKB/Swiss-Prot;Acc:A0A589] +MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKK +PLELMFVYSLEERVENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ +>ENSP00000488127.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142370836:142371348:1 gene:ENSG00000282353.1 transcript:ENST00000632148.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC234635.1 description:T cell receptor beta variable 6-3 [Source:UniProtKB/Swiss-Prot;Acc:P0DPF7] +MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGM +GLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY +>ENSP00000488152.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142374511:142375050:1 gene:ENSG00000282506.1 transcript:ENST00000631392.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-2 description:T cell receptor beta variable 7-2 [Source:HGNC Symbol;Acc:HGNC:12236] +MGTRLLFWVAFCLLGADHTGAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQ +GLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL +>ENSP00000487798.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142402503:142402958:1 gene:ENSG00000282240.1 transcript:ENST00000633472.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-4 description:T cell receptor beta variable 6-4 [Source:HGNC Symbol;Acc:HGNC:12229] +MRIRLLCCVAFSLLWAGPVIAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGL +GLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD +>ENSP00000488108.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142406045:142406551:1 gene:ENSG00000282203.1 transcript:ENST00000631882.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-3 description:T cell receptor beta variable 7-3 [Source:HGNC Symbol;Acc:HGNC:12237] +MGTRLLCWAALCLLGADHTGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQ +GPEFLIYFQGTGAADDSGLPNDRFFAVRPEGSVSTLKIQRTERGDSAVYLCASSL +>ENSP00000488267.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142410913:142411379:1 gene:ENSG00000282148.1 transcript:ENST00000634123.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-3 
description:T cell receptor beta variable 5-3 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12220] +MGPGLLCWELLYLLGAGPVEAGVTQSPTHLIKTRGQQVTLRCSPISGHSSVSWYQQAPGQ +GPQFIFEYANELRRSEGNFPNRFSGRQFHDCCSEMNVSALELGDSALYLCARSL +>ENSP00000488515.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142413602:142414123:1 gene:ENSG00000282204.1 transcript:ENST00000633328.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV9 description:T cell receptor beta variable 9 [Source:HGNC Symbol;Acc:HGNC:12246] +MGFRLLCCVAFCLLGAGPVDSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQ +GLQFLIQYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV +>ENSP00000488035.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142421573:142422090:1 gene:ENSG00000282618.1 transcript:ENST00000632248.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-1 description:T cell receptor beta variable 10-1(gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12177] +MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGH +GLRLIHYSYGVHDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE +>ENSP00000488521.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142429379:142429843:1 gene:ENSG00000282711.1 transcript:ENST00000634176.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-1 description:T cell receptor beta variable 11-1 [Source:HGNC Symbol;Acc:HGNC:12180] +MSTRLLCWMALCLLGAELSEAEVAQSPRYKITEKSQAVAFWCDPISGHATLYWYRQILGQ +GPELLVQFQDESVVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAMYLCASSL +>ENSP00000488043.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142446652:142447152:1 gene:ENSG00000282007.1 transcript:ENST00000633575.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-2 description:T cell receptor beta variable 10-2 [Source:HGNC Symbol;Acc:HGNC:12178] +MGTRLFFYVALCLLWAGHRDAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGH +GLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE +>ENSP00000488123.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142472647:142473148:1 gene:ENSG00000277110.3 transcript:ENST00000633072.1 
gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-5 description:T cell receptor beta variable 6-5 [Source:HGNC Symbol;Acc:HGNC:12230] +MSIGLLCCAALSLLWAGPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGM +GLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSY +>ENSP00000488823.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142476873:142477334:1 gene:ENSG00000282756.1 transcript:ENST00000633313.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-4 description:T cell receptor beta variable 7-4 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12238] +MGTRLLCWVVLGFLGTDHTGAGVSQSPRYKVAKRGRDVALRCDSISGHVTLYWYRQTLGQ +GSEVLTYSQSDAQRDKSGRPSGRFSAERPERSVSTLKIQCTEQGDSAVYLCASSL +>ENSP00000488374.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142484618:142485283:1 gene:ENSG00000282466.1 transcript:ENST00000633696.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-4 description:T cell receptor beta variable 5-4 [Source:HGNC Symbol;Acc:HGNC:12221] +MGPGLLCWALLCLLGAGSVDAGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQ +GPQFIFQYYREEENGRGNFPPRFSGLQFPNDSSELNVNALELDDSALYLCASSL +>ENSP00000488741.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142491254:142491732:1 gene:ENSG00000282459.1 transcript:ENST00000633963.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-6 description:T cell receptor beta variable 6-6 [Source:HGNC Symbol;Acc:HGNC:12231] +MSISLLCCAAFPLLWAGPVNAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGM +GLKLIYYSVGAGITDKGEVPNGYNVSRSTTEYFPLRLELAAPSQTSVYFCASSY +>ENSP00000488241.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142504264:142504735:1 gene:ENSG00000282577.1 transcript:ENST00000632187.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-5 description:T cell receptor beta variable 5-5 [Source:HGNC Symbol;Acc:HGNC:12222] +MGPGLLCWVLLCLLVAGPVDAGVTQSPTHLIKTRGQQVTLRCSPISGHKSVSWYQQVLGQ +GPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASSL +>ENSP00000488335.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142509537:142510011:1 
gene:ENSG00000282470.1 transcript:ENST00000631511.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-7 description:T cell receptor beta variable 6-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12232] +MSLGLLCCVAFSLLWAGPMNAGVTQTPKFHVLKTGQSMTLLCAQDMNHEYMYRYRQDPGK +GLRLIYYSVAAALTDKGEVPNGYNVSRSNTEDFPLKLESAAPSQTSVYFCASSY +>ENSP00000488212.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142513848:142514385:1 gene:ENSG00000282704.1 transcript:ENST00000633265.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-6 description:T cell receptor beta variable 7-6 [Source:HGNC Symbol;Acc:HGNC:12240] +MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQ +GPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSVSTLTIQRTEQRDSAMYRCASSL +>ENSP00000487850.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142521745:142522251:1 gene:ENSG00000282098.1 transcript:ENST00000632216.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-6 description:T cell receptor beta variable 5-6 [Source:HGNC Symbol;Acc:HGNC:12223] +MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKSGHDTVSWYQQALGQ +GPQFIFQYYEEEERQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYLCASSL +>ENSP00000488870.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142529032:142529526:1 gene:ENSG00000282134.1 transcript:ENST00000632425.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-8 description:T cell receptor beta variable 6-8 [Source:HGNC Symbol;Acc:HGNC:12233] +MSLGLLCCAAFSLLWAGPVNAGVTQTPKFHILKTGQSMTLQCAQDMNHGYMSWYRQDPGM +GLRLIYYSAAAGTTDKEVPNGYNVSRLNTEDFPLRLVSAAPSRTSVYLCASSY +>ENSP00000488424.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142533342:142533843:1 gene:ENSG00000282179.1 transcript:ENST00000631548.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-7 description:T cell receptor beta variable 7-7 [Source:HGNC Symbol;Acc:HGNC:12241] +MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVTLRCDPISSHATLYWYQQALGQ +GPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL +>ENSP00000488478.1 pep 
chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142541804:142542270:1 gene:ENSG00000282748.1 transcript:ENST00000633790.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-7 description:T cell receptor beta variable 5-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12224] +MGPGLLCWVLLCPLGEGPVDAGVTQSPTHLIKTRGQHVTLRCSPISGHTSVSSYQQALGQ +GPQFIFQYYEKEERGRGNFPDQFSGHQFPNYSSELNVNALLLGDSALYLCASSL +>ENSP00000488280.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142549110:142549542:1 gene:ENSG00000282610.1 transcript:ENST00000634093.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC233282.2 description:T cell receptor beta variable 6-9 [Source:UniProtKB/Swiss-Prot;Acc:A0A0J9YX75] +MSIGLLCCVAFSLLWAGPVNAGVTQTPKFHILKTGQSMTLQCAQDMNHGYLSWYRQDPGM +GLRRIHYSVAAGITDKGEVPDGYNVSRSNTEDFPLRLESAAPSQTSVYFCASSY +>ENSP00000488190.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142553725:142554208:1 gene:ENSG00000282040.1 transcript:ENST00000632560.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC233282.1 description:T cell receptor beta variable 7-8 [Source:UniProtKB/Swiss-Prot;Acc:A0A1B0GX51] +MGTRLLCWVVLGFLGTDHTGAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQ +GPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQQEDSAVYLCASSL +>ENSP00000488017.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142561449:142562408:1 gene:ENSG00000282054.1 transcript:ENST00000631639.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC229888.1 description:T cell receptor beta variable 5-8 [Source:UniProtKB/Swiss-Prot;Acc:A0A5A2] +MGPRLLFWALLCLLGTGPVEAGVTQSPTHLIKTRGQQATLRCSPISGHTSVYWYQQALGL +GLQFLLWYDEGEERNRGNFPPRFSGRQFPNYSSELNVNALELEDSALYLCASSL +>ENSP00000487884.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142571143:142571615:1 gene:ENSG00000281943.1 transcript:ENST00000632021.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-9 description:T cell receptor beta variable 7-9 [Source:HGNC Symbol;Acc:HGNC:12243] +MGTSLLCWMALCLLGADHADTGVSQDPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQ 
+GPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL +>ENSP00000488778.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142577660:142578143:1 gene:ENSG00000282407.1 transcript:ENST00000633796.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV13 description:T cell receptor beta variable 13 [Source:HGNC Symbol;Acc:HGNC:12188] +MLSPDLPDSAWNTRLLCRVMLCLLGAGSVAAGVIQSPRHLIKEKRETATLKCYPIPRHDT +VYWYQQGPGQDPQFLISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFC +ASSL +>ENSP00000487891.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142586067:142586516:1 gene:ENSG00000282340.1 transcript:ENST00000631471.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-3 description:T cell receptor beta variable 10-3 [Source:HGNC Symbol;Acc:HGNC:12179] +MGTRLFFYVALCLLWTGHMDAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGH +GLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE +>ENSP00000487749.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142596665:142597147:1 gene:ENSG00000282242.1 transcript:ENST00000634111.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-3 description:T cell receptor beta variable 11-3 [Source:HGNC Symbol;Acc:HGNC:12182] +MGTRLLCWVAFCLLVEELIEAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYLQNLGQ +GPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL +>ENSP00000487964.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142602235:142602743:1 gene:ENSG00000282208.1 transcript:ENST00000633292.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-3 description:T cell receptor beta variable 12-3 [Source:HGNC Symbol;Acc:HGNC:12185] +MDSWTFCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMR +GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL +>ENSP00000488855.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142605549:142606054:1 gene:ENSG00000282354.1 transcript:ENST00000631824.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-4 description:T cell receptor beta variable 12-4 [Source:HGNC Symbol;Acc:HGNC:12186] 
+MDSWTLCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMR +GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL +>ENSP00000488633.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142622755:142623265:1 gene:ENSG00000282605.1 transcript:ENST00000632829.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-5 description:T cell receptor beta variable 12-5 [Source:HGNC Symbol;Acc:HGNC:12187] +MATRLLCCVVLCLLGEELIDARVTQTPRDKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQ +GLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGL +>ENSP00000488641.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142629704:142630195:1 gene:ENSG00000282252.1 transcript:ENST00000632432.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV14 description:T cell receptor beta variable 14 [Source:HGNC Symbol;Acc:HGNC:12189] +MVSRLLSLVSLCLLGAKHIEAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGK +EIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQ +>ENSP00000488551.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142634764:142635309:1 gene:ENSG00000282497.1 transcript:ENST00000631835.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV15 description:T cell receptor beta variable 15 [Source:HGNC Symbol;Acc:HGNC:12190] +MGPGLLHWMALCLLGTGHGDAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQ +APKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYLCATSR +>ENSP00000487913.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142639852:142640305:1 gene:ENSG00000282415.1 transcript:ENST00000633244.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV16 description:T cell receptor beta variable 16 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12191] +MSPIFTCITILCLLAAGSPGEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKN +EFKFLISFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASSQ +>ENSP00000488775.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142643462:142644194:1 gene:ENSG00000282483.1 transcript:ENST00000631663.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV17 description:T cell receptor beta 
variable 17 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12192] +MDIWLLCWVTLCLLAAGHSEPGVSQTPRHKVTNMGQEVILRCDPSSGHMFVHWYRQNLRQ +EMKLLISFQYQNIAVDSGMPKERFTAERPNGTSSTLKIHPAEPRDSAVYLYSSG +>ENSP00000488621.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142657499:142658198:1 gene:ENSG00000282771.1 transcript:ENST00000631559.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV18 description:T cell receptor beta variable 18 [Source:HGNC Symbol;Acc:HGNC:12193] +MDTRVLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEE +GLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSP +>ENSP00000487807.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142660632:142661315:1 gene:ENSG00000282621.1 transcript:ENST00000632638.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV19 description:T cell receptor beta variable 19 [Source:HGNC Symbol;Acc:HGNC:12194] +MSNQVLCCVVLCFLGANTVDGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQ +GLRLIYYSQIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI +>ENSP00000488099.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142668431:142669181:1 gene:ENSG00000282064.1 transcript:ENST00000633466.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV20-1 description:T cell receptor beta variable 20-1 [Source:HGNC Symbol;Acc:HGNC:12196] +MLLLLLLLGPGSGLGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML +MATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR +>ENSP00000487718.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142687733:142688229:1 gene:ENSG00000282449.1 transcript:ENST00000633842.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV23-1 description:T cell receptor beta variable 23-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12201] +MGTRLLGCAALCLLAADSFHAKVTQTPGHLVKGKGQKTKMDCTPEKGHTFVYWYQQNQNK +EFMLLISFQNEQVLQETEMHKKRFSSQCPKNAPCSLAILSSEPGDTALYLCASSQ +>ENSP00000488057.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142697038:142697550:1 gene:ENSG00000282730.1 transcript:ENST00000633092.1 gene_biotype:TR_V_gene 
transcript_biotype:TR_V_gene gene_symbol:TRBV24-1 description:T cell receptor beta variable 24-1 [Source:HGNC Symbol;Acc:HGNC:12203] +MASLLFFCGAFYLLGTGSMDADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGL +GLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL +>ENSP00000479511.2 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142711413:142711917:1 gene:ENSG00000281963.1 transcript:ENST00000610439.4 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV25-1 description:T cell receptor beta variable 25-1 [Source:HGNC Symbol;Acc:HGNC:12205] +MTIRLLCYMGFYFLGAGLMEADIYQTPRYLVIGTGKKITLECSQTMGHDKMYWYQQDPGM +ELHLIHYSYGVNSTEKGDLSSESTVSRIRTEHFPLTLESARPSHTSQYLCASSE +>ENSP00000488274.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142756048:142756563:1 gene:ENSG00000282234.1 transcript:ENST00000633283.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV27 description:T cell receptor beta variable 27 [Source:HGNC Symbol;Acc:HGNC:12208] +MGPQLLGYVVLCLLGAGPLEAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGL +GLRQIYYSMNVEVTDKGDVPEGYKVSRKEKRNFPLILESPSPNQTSLYFCASSL +>ENSP00000480928.2 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142761362:142761862:1 gene:ENSG00000282812.1 transcript:ENST00000619125.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV28 description:T cell receptor beta variable 28 [Source:HGNC Symbol;Acc:HGNC:12209] +MGIRLLCRVAFCFLAVGLVDVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGL +GLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASSL +>ENSP00000488861.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142780919:142781607:1 gene:ENSG00000282628.1 transcript:ENST00000634198.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV29-1 description:T cell receptor beta variable 29-1 [Source:HGNC Symbol;Acc:HGNC:12210] +MLSLLLLLLGLGSVFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTL +IATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVE +>ENSP00000487814.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142873679:142874492:-1 gene:ENSG00000282297.1 
transcript:ENST00000631690.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV30 description:T cell receptor beta variable 30 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12214] +MLCSLLALLLGTFFGVRSQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRG +LQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS +>ENSP00000368747.3 pep chromosome:GRCh38:9:33617762:33618506:1 gene:ENSG00000205274.3 transcript:ENST00000379435.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV20OR9-2 description:T cell receptor beta variable 20/OR9-2 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12197] +METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVICKSGTSVNIECRSLD +FQATTMFWYRQLRKQSLMLMATSNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPE +DSSFYICSAR +>ENSP00000374867.2 pep chromosome:GRCh38:7:38349355:38350022:-1 gene:ENSG00000211697.4 transcript:ENST00000390344.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV5 description:T cell receptor gamma variable 5 [Source:HGNC Symbol;Acc:HGNC:12290] +MRWALLVLLAFLSPASQKSSNLEGGTKSVTRPTRSSAEITCDLTVINAFYIHWYLHQEGK +APQRLLYYDVSNSKDVLESGLSPGKYYTHTPRRWSWILILRNLIENDSGVYYCATWDR +>ENSP00000404928.2 pep chromosome:GRCh38:7:38362864:38363518:-1 gene:ENSG00000233306.2 transcript:ENST00000426402.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV2 description:T cell receptor gamma variable 2 [Source:HGNC Symbol;Acc:HGNC:12287] +MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSNGYIHWYLHQEGK +APQRLQYYDSYNSKVVLESGVSPGKYYTYASTRNNLRLILRNLIENDFGVYYCATWDG +>ENSP00000374864.2 pep chromosome:GRCh38:7:38299811:38300322:-1 gene:ENSG00000211694.2 transcript:ENST00000390341.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV10 description:T cell receptor gamma variable 10 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12285] +MFIGNSPLLLTVGLGLSKVEQFQLSISTEVKKSIDIPCKISSTRFETDVIHWYRQKPNQA +LEHLIYIVSTKSAARRSMGKTSNKVEARKNSQTLTSILTIKSVEKEDMAVYYCAAWD +>ENSP00000374866.2 pep chromosome:GRCh38:7:38330343:38330935:-1 
gene:ENSG00000211696.2 transcript:ENST00000390343.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV8 description:T cell receptor gamma variable 8 [Source:HGNC Symbol;Acc:HGNC:12294] +MLLALALLLAFLPPASQKSSNLEGRTKSVTRPTGSSAVITCDLPVENAVYTHWYLHQEGK +APQRLLYYDSYNSRVVLESGISREKYHTYASTGKSLKFILENLIERDSGVYYCATWDR +>ENSP00000374869.2 pep chromosome:GRCh38:7:38358512:38359162:-1 gene:ENSG00000211699.2 transcript:ENST00000390346.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV3 description:T cell receptor gamma variable 3 [Source:HGNC Symbol;Acc:HGNC:12288] +MRWALLVLLAFLSPASQKSSNLEGRTKSVTRQTGSSAEITCDLTVTNTFYIHWYLHQEGK +APQRLLYYDVSTARDVLESGLSPGKYYTHTPRRWSWILRLQNLIENDSGVYYCATWDR +>ENSP00000391561.2 pep chromosome:GRCh38:7:38317017:38318861:-1 gene:ENSG00000211695.2 transcript:ENST00000444775.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV9 description:T cell receptor gamma variable 9 [Source:HGNC Symbol;Acc:HGNC:12295] +MLSLLHTSTLAVLGALCVYGAGHLEQPQISSTKTLSKTARLECVVSGITISATSVYWYRE +RPGEVIQFLVSISYDGTVRKESGIPSGKFEVDRIPETSTSTLTIHNVEKQDIATYYCALW +EV +>ENSP00000374868.2 pep chromosome:GRCh38:7:38353715:38354517:-1 gene:ENSG00000211698.2 transcript:ENST00000390345.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV4 description:T cell receptor gamma variable 4 [Source:HGNC Symbol;Acc:HGNC:12289] +MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSTGYIHWYLHQEGK +APQRLLYYDSYTSSVVLESGISPGKYDTYGSTRKNLRMILRNLIENDSGVYYCATWDG +>ENSP00000374871.2 pep chromosome:GRCh38:7:38367586:38368169:-1 gene:ENSG00000211701.2 transcript:ENST00000390348.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV1 description:T cell receptor gamma variable 1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12284] +MRWALAVLLAFLSPASQISSNLEGRTKSVTRLTGSSAEITCDLPGASTLYIHWYLHQEGK +APQCLLYYEPYYSRVVLESGITPGKYDTGSTRSNWNLRLQNLIKNDSGFYYCATWDR +>ENSP00000374863.2 pep chromosome:GRCh38:7:38291616:38292078:-1 gene:ENSG00000211693.2 
transcript:ENST00000390340.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV11 description:T cell receptor gamma variable 11 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12286] +LGQLEQPEISISRPANKSAHISWKASIQGFSSKIIHWYWQKPNKGLEYLLHVFLTISAQD +CSGGKTKKLEVSKNAHTSTSTLKIKFLEKEDEVVYHCACWIRH +>ENSP00000446309.1 pep chromosome:GRCh38:14:21621838:21622567:1 gene:ENSG00000255569.1 transcript:ENST00000542354.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV1-1 description:T cell receptor alpha variable 1-1 [Source:HGNC Symbol;Acc:HGNC:12101] +MWGAFLLYVSMKMGGTAGQSLEQPSEVTAVEGAIVQINCTYQTSGFYGLSWYQQHDGGAP +TFLSYNALDGLEETGRFSSFLSRSDSYGYLLLQELQMKDSASYFCAVR +>ENSP00000439668.1 pep chromosome:GRCh38:14:21642889:21643578:1 gene:ENSG00000256553.1 transcript:ENST00000390423.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV1-2 description:T cell receptor alpha variable 1-2 [Source:HGNC Symbol;Acc:HGNC:12102] +MWGVFLLYVSMKMGGTTGQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAP +TFLSYNVLDGLEEKGRFSSFLSRSKGYSYLLLKELQMKDSASYLCA +>ENSP00000438195.1 pep chromosome:GRCh38:14:21712321:21712843:1 gene:ENSG00000211776.2 transcript:ENST00000390424.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV2 description:T cell receptor alpha variable 2 [Source:HGNC Symbol;Acc:HGNC:12116] +MALQSTLGAVWLGLLLNSLWKVAESKDQVFQPSTVASSEGAVVEIFCNHSVSNAYNFFWY +LHFPGCAPRLLVKGSKPSQQGRYNMTYERFSSSLLILQVREADAAVYYCAVE +>ENSP00000444955.1 pep chromosome:GRCh38:14:21723713:21724321:1 gene:ENSG00000211777.2 transcript:ENST00000390425.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV3 description:T cell receptor alpha variable 3 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12128] +MASAPISMLAMLFTLSGLRAQSVAQPEDQVNVAEGNPLTVKCTYSVSGNPYLFWYVQYPN +RGLQFLLKYITGDNLVKGSYGFEAEFNKSQTSFHLKKPSALVSDSALYFCAVRD +>ENSP00000451535.1 pep chromosome:GRCh38:14:21736152:21736982:1 gene:ENSG00000211778.2 transcript:ENST00000390426.2 gene_biotype:TR_V_gene 
transcript_biotype:TR_V_gene gene_symbol:TRAV4 description:T cell receptor alpha variable 4 [Source:HGNC Symbol;Acc:HGNC:12140] +MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIATNDYITWYQQFPSQG +PRFIIQGYKTKVTNEVASLFIPADRKSSTLSLPRVSLSDTAVYYCLVGD +>ENSP00000446355.1 pep chromosome:GRCh38:14:21749178:21749705:1 gene:ENSG00000211779.3 transcript:ENST00000390427.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV5 description:T cell receptor alpha variable 5 [Source:HGNC Symbol;Acc:HGNC:12143] +MKTFAGFSFLFLWLQLDCMSRGEDVEQSLFLSVREGDSSVINCTYTDSSSTYLYWYKQEP +GAGLQLLTYIFSNMDMKQDQRLTVLLNKKDKHLSLRIADTQTGDSAIYFCAES +>ENSP00000438290.1 pep chromosome:GRCh38:14:21768489:21769080:1 gene:ENSG00000211780.3 transcript:ENST00000390428.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV6 description:T cell receptor alpha variable 6 [Source:HGNC Symbol;Acc:HGNC:12144] +MAFWLRSLGLHFRPHLGRRMESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATL +TCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQ +PADSATYLCALD +>ENSP00000443297.1 pep chromosome:GRCh38:14:21782993:21783503:1 gene:ENSG00000211781.3 transcript:ENST00000390429.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV7 description:T cell receptor alpha variable 7 [Source:HGNC Symbol;Acc:HGNC:12145] +MEKMRRPVLIIFCLCLGWANGENQVEHSPHFLGPQQGDVASMSCTYSVSRFNNLQWYRQN +TGMGPKHLLSMYSAGYEKQKGRLNATLLKNGSSLYITAVQPEDSATYFCAVD +>ENSP00000443059.1 pep chromosome:GRCh38:14:21797287:21797886:1 gene:ENSG00000211782.2 transcript:ENST00000390430.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-1 description:T cell receptor alpha variable 8-1 [Source:HGNC Symbol;Acc:HGNC:12146] +MLLLLIPVLGMIFALRDARAQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPG +QHLQLLLKYFSGDPLVKGIKGFEAEFIKSKFSFNLRKPSVQWSDTAEYFCAVN +>ENSP00000438446.1 pep chromosome:GRCh38:14:21811502:21811977:1 gene:ENSG00000211783.3 transcript:ENST00000390431.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV9-1 
description:T cell receptor alpha variable 9-1 [Source:HGNC Symbol;Acc:HGNC:12153] +MNSSPGPAIALFLMFGGINGDSVVQTEGQVLPSEGDSLIVNCSYETTQYPSLFWYVQYPG +EGPQLHLKAMKANDKGRNKGFEAMYRKETTSFHLEKDSVQESDSAVYFCALS +>ENSP00000440313.1 pep chromosome:GRCh38:14:21825472:21826075:1 gene:ENSG00000211784.2 transcript:ENST00000390432.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV10 description:T cell receptor alpha variable 10 [Source:HGNC Symbol;Acc:HGNC:12103] +MKKHLTTFLVILWLYFYRGNGKNQVEQSPQSLIILEGKNCTLQCNYTVSPFSNLRWYKQD +TGRGPVSLTIMTFSENTKSNGRYTATLDADTKQSSLHITASQLSDSASYICVVS +>ENSP00000445405.1 pep chromosome:GRCh38:14:21841240:21841774:1 gene:ENSG00000211785.1 transcript:ENST00000390433.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV12-1 description:T cell receptor alpha variable 12-1 [Source:HGNC Symbol;Acc:HGNC:12105] +MISLRVLLVILWLQLSWVWSQRKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQD +CRKEPKLLMSVYSSGNEDGRFTAQLNRASQYISLLIRDSKLSDSATYLCVVN +>ENSP00000439323.1 pep chromosome:GRCh38:14:21846537:21847221:1 gene:ENSG00000211786.3 transcript:ENST00000390434.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-2 description:T cell receptor alpha variable 8-2 [Source:HGNC Symbol;Acc:HGNC:12147] +MLLLLVPVLEVIFTLGGTRAQSVTQLDSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPN +KGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVVS +>ENSP00000440087.1 pep chromosome:GRCh38:14:21852558:21853006:1 gene:ENSG00000211787.1 transcript:ENST00000390435.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-3 description:T cell receptor alpha variable 8-3 [Source:HGNC Symbol;Acc:HGNC:12148] +MLLELIPLLGIHFVLRTARAQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPG +QGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVG +>ENSP00000441696.1 pep chromosome:GRCh38:14:21868839:21869365:1 gene:ENSG00000211788.2 transcript:ENST00000390436.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV13-1 description:T cell receptor alpha variable 13-1 
[Source:HGNC Symbol;Acc:HGNC:12108] +MTSIRAVFIFLWLQLDLVNGENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELG +KGPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLHITETQPEDSAVYFCAAS +>ENSP00000437362.1 pep chromosome:GRCh38:14:21887857:21888502:1 gene:ENSG00000211789.2 transcript:ENST00000390437.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV12-2 description:T cell receptor alpha variable 12-2 [Source:HGNC Symbol;Acc:HGNC:12106] +MKSLRVLLVILWLQLSWVWSQQKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQY +SGKSPELIMFIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVN +>ENSP00000445942.1 pep chromosome:GRCh38:14:21894433:21895030:1 gene:ENSG00000211790.2 transcript:ENST00000390438.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-4 description:T cell receptor alpha variable 8-4 [Source:HGNC Symbol;Acc:HGNC:12149] +MLLLLVPVLEVIFTLGGTRAQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPN +QGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS +>ENSP00000438480.1 pep chromosome:GRCh38:14:21918188:21918756:1 gene:ENSG00000211791.2 transcript:ENST00000390439.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV13-2 description:T cell receptor alpha variable 13-2 [Source:HGNC Symbol;Acc:HGNC:12109] +MAGIRALFMYLWLQLDWVSRGESVGLHLPTLSVQEGDNSIINCAYSNSASDYFIWYKQES +GKGPQFIIDIRSNMDKRQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYFCAEN +>ENSP00000446015.1 pep chromosome:GRCh38:14:21924063:21924651:1 gene:ENSG00000211792.2 transcript:ENST00000390440.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV14DV4 description:T cell receptor alpha variable 14/delta variable 4 [Source:HGNC Symbol;Acc:HGNC:12110] +MSLSSLLKVVTASLWLGPGIAQKITQTQPGMFVQEKEAVTLDCTYDTSDPSYGLFWYKQP +SSGEMIFLIYQGSYDQQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAMRE +>ENSP00000452011.1 pep chromosome:GRCh38:14:21941128:21941657:1 gene:ENSG00000211793.2 transcript:ENST00000390441.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV9-2 description:T cell receptor alpha variable 9-2 [Source:HGNC 
Symbol;Acc:HGNC:12154] +MNYSPGLVSLILLLLGRTRGDSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPG +EGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCALS +>ENSP00000451822.1 pep chromosome:GRCh38:14:21965451:21966061:1 gene:ENSG00000211794.3 transcript:ENST00000390442.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV12-3 description:T cell receptor alpha variable 12-3 [Source:HGNC Symbol;Acc:HGNC:12107] +MMKSLRVLLVILWLQLSWVWSQQKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQ +YSRKGPELLMYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMS +>ENSP00000450505.1 pep chromosome:GRCh38:14:21978459:21979120:1 gene:ENSG00000211795.3 transcript:ENST00000390443.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-6 description:T cell receptor alpha variable 8-6 [Source:HGNC Symbol;Acc:HGNC:12151] +MLLLLVPAFQVIFTLGGTRAQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPN +QGLQLLLKYLSGSTLVESINGFEAEFNKSQTSFHLRKPSVHISDTAEYFCAVS +>ENSP00000451359.1 pep chromosome:GRCh38:14:21990496:21990938:1 gene:ENSG00000211796.1 transcript:ENST00000390444.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV16 description:T cell receptor alpha variable 16 [Source:HGNC Symbol;Acc:HGNC:12112] +MKPTLISVLVIIFILRGTRAQRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSR +QRLQLLLRHISRESIKGFTADLNKGETSFHLKKPFAQEEDSAMYYCALS +>ENSP00000452087.1 pep chromosome:GRCh38:14:21997539:21998168:1 gene:ENSG00000211797.2 transcript:ENST00000390445.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV17 description:T cell receptor alpha variable 17 [Source:HGNC Symbol;Acc:HGNC:12113] +METLLGVSLVILWLQLARVNSQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSG +RGLVHLILIRSNEREKHSGRLRVTLDTSKKSSSLLITASRAADTASYFCATD +>ENSP00000451574.1 pep chromosome:GRCh38:14:22003106:22003673:1 gene:ENSG00000211798.3 transcript:ENST00000390446.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV18 description:T cell receptor alpha variable 18 [Source:HGNC Symbol;Acc:HGNC:12114] 
+MLSASCSGLVILLIFRRTSGDSVTQTEGPVTLPERAALTLNCTYQSSYSTFLFWYVQYLN +KEPELLLKSSENQETDSRGFQASPIKSDSSFHLEKPSVQLSDSAVYYCALR +>ENSP00000452148.1 pep chromosome:GRCh38:14:22007512:22008181:1 gene:ENSG00000211799.3 transcript:ENST00000390447.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV19 description:T cell receptor alpha variable 19 [Source:HGNC Symbol;Acc:HGNC:12115] +MLTASLLRAVIASICVVSSMAQKVTQAQTEISVVEKEDVTLDCVYETRDTTYYLFWYKQP +PSGELVFLIRRNSFDEQNEISGRYSWNFQKSTSSFNFTITASQVVDSAVYFCALSE +>ENSP00000452067.1 pep chromosome:GRCh38:14:22040594:22041153:1 gene:ENSG00000211800.3 transcript:ENST00000390448.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV20 description:T cell receptor alpha variable 20 [Source:HGNC Symbol;Acc:HGNC:12117] +MEKMLECAFIVLWLQLGWLSGEDQVTQSPEALRLQEGESSSLNCSYTVSGLRGLFWYRQD +PGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCAVQ +>ENSP00000452526.1 pep chromosome:GRCh38:14:22052514:22053056:1 gene:ENSG00000211801.3 transcript:ENST00000390449.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV21 description:T cell receptor alpha variable 21 [Source:HGNC Symbol;Acc:HGNC:12118] +METLLGLLILWLQLQWVSSKQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPG +KGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCAVR +>ENSP00000452420.1 pep chromosome:GRCh38:14:22070557:22071208:1 gene:ENSG00000211802.3 transcript:ENST00000390450.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV22 description:T cell receptor alpha variable 22 [Source:HGNC Symbol;Acc:HGNC:12119] +MKRILGALLGLLSAQVCCVRGIQVEQSPPDLILQEGANSTLRCNFSDSVNNLQWFHQNPW +GQLINLFYIPSGTKQNGRLSATTVATERYSLLYISSSQTTDSGVYFCAVE +>ENSP00000451203.1 pep chromosome:GRCh38:14:22086407:22086961:1 gene:ENSG00000211803.2 transcript:ENST00000390451.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV23DV6 description:T cell receptor alpha variable 23/delta variable 6 [Source:HGNC Symbol;Acc:HGNC:12120] 
+MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGISIINCAYENTAFDY +FPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSSHIMDSQPGDSATYFCAA +S +>ENSP00000452111.1 pep chromosome:GRCh38:14:22096032:22096619:1 gene:ENSG00000211804.3 transcript:ENST00000390452.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRDV1 description:T cell receptor delta variable 1 [Source:HGNC Symbol;Acc:HGNC:12262] +MLFSSLLCVFVAFSYSGSSVAQKVTQAQSSVSMPVRKAVTLNCLYETSWWSYYIFWYKQL +PSKEMIFLIRQGSDEQNAKSGRYSVNFKKAAKSVALTISALQLEDSAKYFCALGE +>ENSP00000484940.1 pep chromosome:GRCh38:14:22096507:22096608:1 gene:ENSG00000211804.3 transcript:ENST00000621643.1 gene_biotype:TR_V_gene transcript_biotype:protein_coding gene_symbol:TRDV1 description:T cell receptor delta variable 1 [Source:HGNC Symbol;Acc:HGNC:12262] +KSGRYSVNFKKAAKSVALTISALQLEDSAKYFCA +>ENSP00000451837.1 pep chromosome:GRCh38:14:22105343:22105846:1 gene:ENSG00000211805.1 transcript:ENST00000390453.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV24 description:T cell receptor alpha variable 24 [Source:HGNC Symbol;Acc:HGNC:12121] +MEKNPLAAPLLILWFHLDCVSSILNVEQSPQSLHVQEGDSTNFTCSFPSSNFYALHWYRW +ETAKSPEALFVMTLNGDEKKKGRISATLNTKEGYSYLYIKGSQPEDSATYLCAF +>ENSP00000452100.1 pep chromosome:GRCh38:14:22112347:22113031:1 gene:ENSG00000211806.2 transcript:ENST00000390454.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV25 description:T cell receptor alpha variable 25 [Source:HGNC Symbol;Acc:HGNC:12122] +MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTTLSNIQWYKQRPGGH +PVFLIQLVKSGEVKKQKRLTFQFGEAKKNSSLHITATQTTDVGTYFCAG +>ENSP00000452431.1 pep chromosome:GRCh38:14:22123318:22124285:1 gene:ENSG00000211807.3 transcript:ENST00000390455.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV26-1 description:T cell receptor alpha variable 26-1 [Source:HGNC Symbol;Acc:HGNC:12123] +MRLVARVTVFLTFGTIIDAKTTQPTSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQG +PQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVRV 
+>ENSP00000450448.1 pep chromosome:GRCh38:14:22132553:22133034:1 gene:ENSG00000211808.3 transcript:ENST00000390456.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-7 description:T cell receptor alpha variable 8-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12152] +MLLVVILLLGMFFTLSKTQSVTQLDGHITVSEEAPLELKCNYSYSGVPSLFWYVQYSSQS +LQLLLKDLTKATQVKGIRGFEAEFKKSETSFYLRKPSTHVSDAAEYFCAVGDR +>ENSP00000451735.1 pep chromosome:GRCh38:14:22147995:22148633:1 gene:ENSG00000211809.2 transcript:ENST00000390457.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV27 description:T cell receptor alpha variable 27 [Source:HGNC Symbol;Acc:HGNC:12125] +MVLKFSVSILWIQLAWVSTQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEG +PVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQTGDTGLYLCAG +>ENSP00000452209.1 pep chromosome:GRCh38:14:22163238:22163870:1 gene:ENSG00000211810.3 transcript:ENST00000390458.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV29DV5 description:T cell receptor alpha variable 29/delta variable 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12127] +MAMLLGASVLILWLQPDWVNSQQKNDDQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFL +WYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYFCAAS +>ENSP00000451308.1 pep chromosome:GRCh38:14:22168429:22168988:1 gene:ENSG00000259092.1 transcript:ENST00000557168.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV30 description:T cell receptor alpha variable 30 [Source:HGNC Symbol;Acc:HGNC:12129] +METLLKVLSGTLLWQLTWVRSQQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHG +EAPVFLMILLKGGEQKGHDKISASFNEKKQQSSLYLTASQLSYSGTYFCGTE +>ENSP00000450865.1 pep chromosome:GRCh38:14:22304054:22304553:1 gene:ENSG00000211818.1 transcript:ENST00000390466.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV39 description:T cell receptor alpha variable 39 [Source:HGNC Symbol;Acc:HGNC:12139] +MKKLLAMILWLQLDRLSGELKVEQNPLFLSMQEGKNYTIYCNYSTTSDRLYWYRQDPGKS +LESLFVLLSNGAVKQEGRLMASLDTKARLSTLHITAAVHDLSATYFCAVD 
+>ENSP00000452002.1 pep chromosome:GRCh38:14:22207522:22208129:1 gene:ENSG00000211813.2 transcript:ENST00000390461.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV34 description:T cell receptor alpha variable 34 [Source:HGNC Symbol;Acc:HGNC:12133] +METVLQVLLGILGFQAAWVSSQELEQSPQSLIVQEGKNLTINCTSSKTLYGLYWYKQKYG +EGLIFLMMLQKGGEEKSHEKITAKLDEKKQQSSLHITASQPSHAGIYLCGAD +>ENSP00000452585.1 pep chromosome:GRCh38:14:22314490:22314919:1 gene:ENSG00000211819.3 transcript:ENST00000390467.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV40 description:T cell receptor alpha variable 40 [Source:HGNC Symbol;Acc:HGNC:12141] +MNSSLDFLILILMFGGTSSNSVKQTGQITVSEGASVTMNCTYTSTGYPTLFWYVEYPSKP +LQLLQRETMENSKNFGGGNIKDKNSPIVKYSVQVSDSAVYYCLLG +>ENSP00000450950.1 pep chromosome:GRCh38:14:22271968:22272563:1 gene:ENSG00000211816.2 transcript:ENST00000390464.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV38-1 description:T cell receptor alpha variable 38-1 [Source:HGNC Symbol;Acc:HGNC:12137] +MTRVSLLWAVVVSTCLESGMAQTVTQSQPEMSVQEAETVTLSCTYDTSENNYYLFWYKQP +PSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCAFMK +>ENSP00000451177.1 pep chromosome:GRCh38:14:22320188:22320691:1 gene:ENSG00000211820.1 transcript:ENST00000390468.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV41 description:T cell receptor alpha variable 41 [Source:HGNC Symbol;Acc:HGNC:12142] +MVKIRQFLLAILWLQLSCVSAAKNEVEQSPQNLTAQEGEFITINCSYSVGISALHWLQQH +PGGGIVSLFMLSSGKKKHGRLIATINIQEKHSSLHITASHPRDSAVYICAVR +>ENSP00000452332.1 pep chromosome:GRCh38:14:22281105:22281748:1 gene:ENSG00000211817.2 transcript:ENST00000390465.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV38-2DV8 description:T cell receptor alpha variable 38-2/delta variable 8 [Source:HGNC Symbol;Acc:HGNC:12138] +MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYDTSESDYYLFWYKQP +PSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDAAMYFCAYRS +>ENSP00000450970.1 pep 
chromosome:GRCh38:14:22202583:22203368:1 gene:ENSG00000211812.1 transcript:ENST00000390460.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV26-2 description:T cell receptor alpha variable 26-2 [Source:HGNC Symbol;Acc:HGNC:12124] +MKLVTSITVLLSLGIMGDAKTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQG +PEYVIHGLTSNVNNRMASLAIAEDRKSSTLILHRATLRDAAVYYCILRD +>ENSP00000451750.1 pep chromosome:GRCh38:14:22469041:22469698:-1 gene:ENSG00000256590.2 transcript:ENST00000535880.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRDV3 description:T cell receptor delta variable 3 [Source:HGNC Symbol;Acc:HGNC:12264] +MILTVGFSFLFFYRGTLCDKVTQSSPDQTVASGSEVVLLCTYDTVYSNPDLFWYRIRPDY +SFQFVFYGDNSRSEGADFTQGRFSVKHILTQKAFHLVISPVRTEDSATYYCAF +>ENSP00000451578.1 pep chromosome:GRCh38:14:22422371:22423042:1 gene:ENSG00000211821.2 transcript:ENST00000390469.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRDV2 description:T cell receptor delta variable 2 [Source:HGNC Symbol;Acc:HGNC:12263] +MQRISSLIHLSLFWAGVMSAIELVPEHQTVPVSIGVPATLRCSMKGEAIGNYYINWYRKT +QGNTMTFIYREKDIYGPGFKDNFQGDIDIAKNLAVLKILAPSERDEGSYYCACDT +>ENSP00000450804.1 pep chromosome:GRCh38:14:22226746:22227254:1 gene:ENSG00000211815.3 transcript:ENST00000390463.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV36DV7 description:T cell receptor alpha variable 36/delta variable 7 [Source:HGNC Symbol;Acc:HGNC:12135] +MMKCPQALLAIFWLLLSWVSSEDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQE +KKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAIYLCAVE +>ENSP00000388523.3 pep chromosome:GRCh38:7:142300924:142301432:1 gene:ENSG00000226660.2 transcript:ENST00000455382.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV2 description:T cell receptor beta variable 2 [Source:HGNC Symbol;Acc:HGNC:12195] +MDTWLVCWAIFSLLKAGLTEPEVTQTPSHQVTQMGQEVILRCVPISNHLYFYWYRQILGQ +KVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE +>ENSP00000374910.3 pep chromosome:GRCh38:7:142308542:142309048:1 
gene:ENSG00000237702.2 transcript:ENST00000390387.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV3-1 description:T cell receptor beta variable 3-1 [Source:HGNC Symbol;Acc:HGNC:12212] +MGCRLLCCVVFCLLQAGPLDTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKK +FLKIMFSYNNKELIINETVPNRFSPKSPDKAHLNLHINSLELGDSAVYFCASSQ +>ENSP00000374880.3 pep chromosome:GRCh38:7:142313184:142313666:1 gene:ENSG00000211710.3 transcript:ENST00000390357.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-1 description:T cell receptor beta variable 4-1 [Source:HGNC Symbol;Acc:HGNC:12215] +MGCRLLCCAVLCLLGAVPIDTEVTQTPKHLVMGMTNKKSLKCEQHMGHRAMYWYKQKAKK +PPELMFVYSYEKLSINESVPSRFSPECPNSSLLNLHLHALQPEDSALYLCASSQ +>ENSP00000374904.3 pep chromosome:GRCh38:7:142320677:142321544:1 gene:ENSG00000211734.3 transcript:ENST00000390381.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-1 description:T cell receptor beta variable 5-1 [Source:HGNC Symbol;Acc:HGNC:12218] +MGSRLLCWVLLCLLGAGPVKAGVTQTPRYLIKTRGQQVTLSCSPISGHRSVSWYQQTPGQ +GLQFLFEYFSETQRNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYLCASSL +>ENSP00000374876.2 pep chromosome:GRCh38:7:142328297:142328786:1 gene:ENSG00000211706.2 transcript:ENST00000390353.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-1 description:T cell receptor beta variable 6-1 [Source:HGNC Symbol;Acc:HGNC:12226] +MSIGLLCCVAFSLLWASPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHNSMYWYRQDPGM +GLRLIYYSASEGTTDKGEVPNGYNVSRLNKREFSLRLESAAPSQTSVYFCASSE +>ENSP00000448600.2 pep chromosome:GRCh38:7:142332182:142332701:1 gene:ENSG00000211707.3 transcript:ENST00000547918.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-1 description:T cell receptor beta variable 7-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12235] +MGTRLLCWAAICLLGADHTGAGVSQSLRHKVAKKGKDVALRYDPISGHNALYWYRQSLGQ +GLEFPIYFQGKDAADKSGLPRDRFSAQRSEGSISTLKFQRTQQGDLAVYLCASSS +>ENSP00000374915.3 pep chromosome:GRCh38:7:142345421:142345985:1 gene:ENSG00000211745.3 
transcript:ENST00000390392.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-2 description:T cell receptor beta variable 4-2 [Source:HGNC Symbol;Acc:HGNC:12216] +MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKK +PLELMFVYNFKEQTENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ +>ENSP00000488969.1 pep chromosome:GRCh38:7:142349152:142349664:1 gene:ENSG00000283063.1 transcript:ENST00000634383.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-2 description:T cell receptor beta variable 6-2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12227] +MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGM +GLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY +>ENSP00000489072.1 pep chromosome:GRCh38:7:142352819:142353358:1 gene:ENSG00000282939.1 transcript:ENST00000634605.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-2 description:T cell receptor beta variable 7-2 [Source:HGNC Symbol;Acc:HGNC:12236] +MGTRLLFWVAFCLLGADHTGAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQ +GLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL +>ENSP00000374883.3 pep chromosome:GRCh38:7:142380806:142381261:1 gene:ENSG00000211713.3 transcript:ENST00000390360.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-4 description:T cell receptor beta variable 6-4 [Source:HGNC Symbol;Acc:HGNC:12229] +MSIRLLCCVAFSLLWAGPVTAGITQAPTSQILAAGRSMTLRCTQDMRHNAMYWYRQDLGL +GLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD +>ENSP00000374884.3 pep chromosome:GRCh38:7:142384329:142384841:1 gene:ENSG00000211714.3 transcript:ENST00000390361.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-3 description:T cell receptor beta variable 7-3 [Source:HGNC Symbol;Acc:HGNC:12237] +MGTRLLCWAALCLLGADHTGAGVSQTPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQ +GPEFLIYFQGTGAADDSGLPKDRFFAVRPEGSVSTLKIQRTEQGDSAAYLRASSL +>ENSP00000374885.1 pep chromosome:GRCh38:7:142389202:142389668:1 gene:ENSG00000211715.1 transcript:ENST00000390362.1 
gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-3 description:T cell receptor beta variable 5-3 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12220] +MGPGLLCWELLYLLGAGPVEAGVTQSPTHLIKTRGQQVTLRCSPISGHSSVSWYQQAPGQ +GPQFIFEYANELRRSEGNFPNRFSGRQFHDYCSEMNVSALELGDSALYLCARSL +>ENSP00000374886.2 pep chromosome:GRCh38:7:142391891:142392412:1 gene:ENSG00000211716.2 transcript:ENST00000390363.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV9 description:T cell receptor beta variable 9 [Source:HGNC Symbol;Acc:HGNC:12246] +MGFRLLCCVAFCLLGAGPVDSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQ +GLQFLIHYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV +>ENSP00000374887.3 pep chromosome:GRCh38:7:142399860:142400377:1 gene:ENSG00000211717.3 transcript:ENST00000390364.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-1 description:T cell receptor beta variable 10-1(gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12177] +MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGH +GLRLIHYSYGVHDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE +>ENSP00000374890.3 pep chromosome:GRCh38:7:142407672:142408136:1 gene:ENSG00000211720.3 transcript:ENST00000390367.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-1 description:T cell receptor beta variable 11-1 [Source:HGNC Symbol;Acc:HGNC:12180] +MSTRLLCWMALCLLGAELSEAEVAQSPRYKITEKSQAVAFWCDPISGHATLYWYRQILGQ +GPELLVQFQDESVVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAMYLCASSL +>ENSP00000404652.2 pep chromosome:GRCh38:7:142424965:142425465:1 gene:ENSG00000229769.2 transcript:ENST00000426318.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-2 description:T cell receptor beta variable 10-2 [Source:HGNC Symbol;Acc:HGNC:12178] +MGTRLFFYVALCLLWAGHRDAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGH +GLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE +>ENSP00000374891.2 pep chromosome:GRCh38:7:142450947:142451448:1 gene:ENSG00000211721.2 transcript:ENST00000390368.2 
gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-5 description:T cell receptor beta variable 6-5 [Source:HGNC Symbol;Acc:HGNC:12230] +MSIGLLCCAALSLLWAGPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGM +GLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSY +>ENSP00000374892.2 pep chromosome:GRCh38:7:142455174:142455635:1 gene:ENSG00000253409.1 transcript:ENST00000390369.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-4 description:T cell receptor beta variable 7-4 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12238] +MGTRLLCWVVLGFLGTDHTGAGVSQSPRYKVAKRGRDVALRCDSISGHVTLYWYRQTLGQ +GSEVLTYSQSDAQRDKSGRPSGRFSAERPERSVSTLKIQRTEQGDSAVYLCASSL +>ENSP00000413966.2 pep chromosome:GRCh38:7:142462916:142463581:1 gene:ENSG00000230099.2 transcript:ENST00000454561.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-4 description:T cell receptor beta variable 5-4 [Source:HGNC Symbol;Acc:HGNC:12221] +MGPGLLCWALLCLLGAGSVETGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQ +GPQFIFQYYREEENGRGNFPPRFSGLQFPNYSSELNVNALELDDSALYLCASSL +>ENSP00000374894.3 pep chromosome:GRCh38:7:142469537:142470013:1 gene:ENSG00000211724.3 transcript:ENST00000390371.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-6 description:T cell receptor beta variable 6-6 [Source:HGNC Symbol;Acc:HGNC:12231] +MSISLLCCAAFPLLWAGPVNAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGM +GLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSY +>ENSP00000374895.3 pep chromosome:GRCh38:7:142482548:142483019:1 gene:ENSG00000211725.3 transcript:ENST00000390372.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-5 description:T cell receptor beta variable 5-5 [Source:HGNC Symbol;Acc:HGNC:12222] +MGPGLLCWVLLCLLGAGPVDAGVTQSPTHLIKTRGQQVTLRCSPISGHKSVSWYQQVLGQ +GPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASSL +>ENSP00000374896.2 pep chromosome:GRCh38:7:142487863:142488295:1 gene:ENSG00000253188.1 transcript:ENST00000390373.2 gene_biotype:TR_V_gene 
transcript_biotype:TR_V_gene gene_symbol:TRBV6-7 description:T cell receptor beta variable 6-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12232] +MSLGLLCCVAFSLLWAGPMNAGVTQTPKFHVLKTGQSMTLLCAQDMNHEYMYRYRQDPGK +GLRLIYYSVAAALTDKGEVPNGYNVSRSNTEDFPLKLESAAPSQTSVYFCASSY +>ENSP00000374897.3 pep chromosome:GRCh38:7:142492132:142492673:1 gene:ENSG00000211727.3 transcript:ENST00000390374.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-6 description:T cell receptor beta variable 7-6 [Source:HGNC Symbol;Acc:HGNC:12240] +MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQ +GPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL +>ENSP00000374898.2 pep chromosome:GRCh38:7:142500028:142500534:1 gene:ENSG00000211728.2 transcript:ENST00000390375.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-6 description:T cell receptor beta variable 5-6 [Source:HGNC Symbol;Acc:HGNC:12223] +MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKSGHDTVSWYQQALGQ +GPQFIFQYYEEEERQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYLCASSL +>ENSP00000374899.2 pep chromosome:GRCh38:7:142507382:142507810:1 gene:ENSG00000253534.1 transcript:ENST00000390376.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-8 description:T cell receptor beta variable 6-8 [Source:HGNC Symbol;Acc:HGNC:12233] +MSLGLLCCAAFSLLWAGPVNAGVTQTPKFHILKTGQSMTLQCAQDMNHGYMSWYRQDPGM +GLRLIYYSAAAGTTDKEVPNGYNVSRLNTEDFPLRLVSAAPSQTSVYLCASSY +>ENSP00000374900.1 pep chromosome:GRCh38:7:142511626:142512127:1 gene:ENSG00000253291.1 transcript:ENST00000390377.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-7 description:T cell receptor beta variable 7-7 [Source:HGNC Symbol;Acc:HGNC:12241] +MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVTLRCDPISSHATLYWYQQALGQ +GPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL +>ENSP00000374901.1 pep chromosome:GRCh38:7:142520090:142520556:1 gene:ENSG00000211731.1 transcript:ENST00000390378.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene 
gene_symbol:TRBV5-7 description:T cell receptor beta variable 5-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12224] +MGPGLLCWVLLCPLGEGPVDAGVTQSPTHLIKTRGQHVTLRCSPISGHTSVSSYQQALGQ +GPQFIFQYYEKEERGRGNFPDQFSGHQFPNYSSELNVNALLLGDSALYLCASSL +>ENSP00000478301.1 pep chromosome:GRCh38:7:142529290:142529762:1 gene:ENSG00000278030.1 transcript:ENST00000612787.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-9 description:T cell receptor beta variable 7-9 [Source:HGNC Symbol;Acc:HGNC:12243] +MGTSLLCWMALCLLGADHADTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQ +GPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL +>ENSP00000477580.1 pep chromosome:GRCh38:7:142535809:142536292:1 gene:ENSG00000276405.1 transcript:ENST00000614171.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV13 description:T cell receptor beta variable 13 [Source:HGNC Symbol;Acc:HGNC:12188] +MLSPDLPDSAWNTRLLCRVMLCLLGAGSVAAGVIQSPRHLIKEKRETATLKCYPIPRHDT +VYWYQQGPGQDPQFLISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFC +ASSL +>ENSP00000479267.1 pep chromosome:GRCh38:7:142544212:142544685:1 gene:ENSG00000275791.1 transcript:ENST00000611462.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-3 description:T cell receptor beta variable 10-3 [Source:HGNC Symbol;Acc:HGNC:12179] +MGTRLFFYVALCLLWTGHMDAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGH +GLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE +>ENSP00000480080.1 pep chromosome:GRCh38:7:142554836:142555318:1 gene:ENSG00000276597.1 transcript:ENST00000611787.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-3 description:T cell receptor beta variable 11-3 [Source:HGNC Symbol;Acc:HGNC:12182] +MGTRLLCWVAFCLLVEELIEAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYLQNLGQ +GPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL +>ENSP00000477916.1 pep chromosome:GRCh38:7:142560423:142560931:1 gene:ENSG00000274752.1 transcript:ENST00000620569.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene 
gene_symbol:TRBV12-3 description:T cell receptor beta variable 12-3 [Source:HGNC Symbol;Acc:HGNC:12185] +MDSWTFCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMR +GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL +>ENSP00000480999.1 pep chromosome:GRCh38:7:142563740:142564245:1 gene:ENSG00000276953.1 transcript:ENST00000617347.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-4 description:T cell receptor beta variable 12-4 [Source:HGNC Symbol;Acc:HGNC:12186] +MGSWTLCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMR +GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL +>ENSP00000479506.1 pep chromosome:GRCh38:7:142580917:142581427:1 gene:ENSG00000275158.1 transcript:ENST00000621184.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-5 description:T cell receptor beta variable 12-5 [Source:HGNC Symbol;Acc:HGNC:12187] +MATRLLCCVVLCLLGEELIDARVTQTPRHKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQ +GLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGL +>ENSP00000477671.1 pep chromosome:GRCh38:7:142587868:142588359:1 gene:ENSG00000275743.1 transcript:ENST00000617639.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV14 description:T cell receptor beta variable 14 [Source:HGNC Symbol;Acc:HGNC:12189] +MVSRLLSLVSLCLLGAKHIEAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGK +EIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQ +>ENSP00000482333.1 pep chromosome:GRCh38:7:142592928:142593473:1 gene:ENSG00000276819.1 transcript:ENST00000616518.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV15 description:T cell receptor beta variable 15 [Source:HGNC Symbol;Acc:HGNC:12190] +MGPGLLHWMALCLLGTGHGDAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQ +APKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYLCATSR +>ENSP00000479210.1 pep chromosome:GRCh38:7:142598016:142598469:1 gene:ENSG00000275243.1 transcript:ENST00000620773.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV16 description:T cell 
receptor beta variable 16 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12191] +MSPIFTCITILCLLAAGSPGEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKN +EFKFLISFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASSQ +>ENSP00000483468.1 pep chromosome:GRCh38:7:142601628:142602360:1 gene:ENSG00000277880.1 transcript:ENST00000619103.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV17 description:T cell receptor beta variable 17 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12192] +MDIWLLCWVTLCLLAAGHSEPGVSQTPRHKVTNMGQEVILRCDPSSGHMFVHWYRQNLRQ +EMKLLISFQYQNIAVDSGMPKERFTAERPNGTSSTLKIHPAEPRDSAVYLYSSG +>ENSP00000483504.1 pep chromosome:GRCh38:7:142615716:142616415:1 gene:ENSG00000276557.1 transcript:ENST00000611520.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV18 description:T cell receptor beta variable 18 [Source:HGNC Symbol;Acc:HGNC:12193] +MDTRLLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEE +GLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSP +>ENSP00000374916.3 pep chromosome:GRCh38:7:142618849:142619532:1 gene:ENSG00000211746.3 transcript:ENST00000390393.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV19 description:T cell receptor beta variable 19 [Source:HGNC Symbol;Acc:HGNC:12194] +MSNQVLCCVVLCLLGANTVDGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQ +GLRLIYYSQIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI +>ENSP00000374917.3 pep chromosome:GRCh38:7:142626649:142627399:1 gene:ENSG00000211747.3 transcript:ENST00000390394.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV20-1 description:T cell receptor beta variable 20-1 [Source:HGNC Symbol;Acc:HGNC:12196] +MLLLLLLLGPGSGLGAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML +MATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR +>ENSP00000374919.1 pep chromosome:GRCh38:7:142645961:142646467:1 gene:ENSG00000211749.1 transcript:ENST00000390396.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV23-1 description:T cell receptor beta 
variable 23-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12201] +MGTRLLGCAALCLLAADSFHAKVTQTPGHLVKGKGQKTKMDCTPEKGHTFVYWYQQNQNK +EFMLLISFQNEQVLQETEMHKKRFSSQCPKNAPCSLAILSSEPGDTALYLCASSQ +>ENSP00000374920.2 pep chromosome:GRCh38:7:142656701:142657213:1 gene:ENSG00000211750.2 transcript:ENST00000390397.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV24-1 description:T cell receptor beta variable 24-1 [Source:HGNC Symbol;Acc:HGNC:12203] +MASLLFFCGAFYLLGTGSMDADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGL +GLQLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL +>ENSP00000374921.3 pep chromosome:GRCh38:7:142670740:142671244:1 gene:ENSG00000282499.1 transcript:ENST00000390398.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV25-1 description:T cell receptor beta variable 25-1 [Source:HGNC Symbol;Acc:HGNC:12205] +MTIRLLCYVGFYFLGAGLMEADIYQTPRYLVIGTGKKITLECSQTMGHDKMYWYQQDPGM +ELHLIHYSYGVNSTEKGDLSSESTVSRIRTEHFPLTLESARPSHTSQYLCASSE +>ENSP00000374922.3 pep chromosome:GRCh38:7:142715346:142715861:1 gene:ENSG00000211752.3 transcript:ENST00000390399.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV27 description:T cell receptor beta variable 27 [Source:HGNC Symbol;Acc:HGNC:12208] +MGPQLLGYVVLCLLGAGPLEAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGL +GLRQIYYSMNVEVTDKGDVPEGYKVSRKEKRNFPLILESPSPNQTSLYFCASSL +>ENSP00000397118.2 pep chromosome:GRCh38:7:142812586:142813399:-1 gene:ENSG00000237254.2 transcript:ENST00000417977.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV30 description:T cell receptor beta variable 30 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12214] +MLCSLLALLLGTFFGVRSQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRG +LQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS +>ENSP00000374923.2 pep chromosome:GRCh38:7:142720660:142721160:1 gene:ENSG00000211753.4 transcript:ENST00000390400.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV28 description:T cell receptor beta variable 28 
[Source:HGNC Symbol;Acc:HGNC:12209] +MGIRLLCRVAFCFLAVGLVDVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGL +GLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASSL +>ENSP00000395459.2 pep chromosome:GRCh38:7:142740206:142740894:1 gene:ENSG00000232869.2 transcript:ENST00000422143.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV29-1 description:T cell receptor beta variable 29-1 [Source:HGNC Symbol;Acc:HGNC:12210] +MLSLLLLLLGLGSVFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTL +IATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVE +>ENSP00000481428.1 pep chromosome:GRCh38:CHR_HSCHR19_4_CTG3_1:54840878:54856485:1 gene:ENSG00000273931.1 transcript:ENST00000610808.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:KIR2DS4 description:killer cell immunoglobulin like receptor, two Ig domains and short cytoplasmic tail 4 [Source:HGNC Symbol;Acc:HGNC:6336] +MSLMVIIMACVGFFLLQGAWPQEGVHRKPSFLALPGHLVKSEETVILQCWSDVMFEHFLL +HREGKFNNTLHLIGEHHDGVSKANFSIGPMMPVLAGTYRCYGSVPHSPYQLSAPSDPLDM +VIIGLYEKPSLSAQPGPTVQAGENVTLSCSSIYPGKGRPMNVGSLQCAASTEHSRPTFLW +ALPPTEGPTDASALSVTLPTSGQTRVIHCLFPSQETLQIVGLHPLNQAPKPVTPDTYMF* +LGPQWSKSLSPSSSSFSFIAGAPTKKMLL*WTKSLQGTEQ*TARILMNKTIRRCHTH +>ENSP00000492265.1 pep chromosome:GRCh38:8:43292483:43363319:1 gene:ENSG00000188877.12 transcript:ENST00000522175.7 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:POTEA description:POTE ankyrin domain family member A [Source:HGNC Symbol;Acc:HGNC:33893] +MVAEVSPKLAASPMKKPFGFRGKMGKWCCCCFPCCRGSGKNNMGAWRDHDDSAFTEPRYH +VRREDLGKLHRAAWWGEVPRADLIVMLRGPGINKRDKKKRTALHLACANGNSEVVSLLLD +RQCQLHVFDSKKRTALIKAVQCQEDECALMLLQHGTDPNLPDMYGNTALHYAVYNEDKLM +AKTLLLYGADIESKNKGGLTPLLLAVHGQKQRMVKFLIKKKANLNALDRFGRICQLLSDY +KENQMPNNSSGNSNPEQDLKLTSEEEPQRLKGSENSQHEKVTQEPDINKDCDREVEEEMQ +KHGSNNVGLSENLTDGAAAGNGDGGLVPQRKSRKHENQQFPNTEIEEYHRPEKKSNEKNK +VKSQIHSVDNLDDITWPSEIASEDYDLLFSNYETFTLLIEQLKMDFNDSASLSKIQDAVI 
+SEEHLLELKNSHYEQLTVEVEQMENMVHVLQK*LSEAKETQLQLAPQKGECEQERYSSSE +EQNDTRKQLSKEQNARILQDEILTTKQKQIEVAEKKMNFEISLSHKEEKELLHENSMMQE +EIAMLRIELDTIKHQNQLREKKYLEYIKSVKEKNDNLLKAIQLNEEALTKAVVQYSGQLS +ILTTENKMLSFELQNVRHNNETLEMEIQSCHFRLATALHDCD +>ENSP00000492193.1 pep chromosome:GRCh38:8:43292483:43363518:1 gene:ENSG00000188877.12 transcript:ENST00000519951.2 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:POTEA description:POTE ankyrin domain family member A [Source:HGNC Symbol;Acc:HGNC:33893] +MVAEVSPKLAASPMKKPFGFRGKMGKWCCCCFPCCRGSGKNNMGAWRDHDDSAFTEPRYH +VRREDLGKLHRAAWWGEVPRADLIVMLRGPGINKRDKKKRTALHLACANGNSEVVSLLLD +RQCQLHVFDSKKRTALIKAVQCQEDECALMLLQHGTDPNLPDMYGNTALHYAVYNEDKLM +AKTLLLYGADIESKNKGGLTPLLLAVHGQKQRMVKFLIKKKANLNALDRFGRTALILAVR +CGSASIVSLLLQQNIDVFSQDVFGQTAEDYAVSSHHSIICQLLSDYKENQMPNNSSGNSN +PEQDLKLTSEEEPQRLKGSENSQHEKVTQEPDINKDCDREVEEEMQKHGSNNVGLSENLT +DGAAAGNGDGGLVPQRKSRKHENQQFPNTEIEEYHRPEKKSNEKNKVKSQIHSVDNLDDI +TWPSEIASEDYDLLFSNYETFTLLIEQLKMDFNDSASLSKIQDAVISEEHLLELKNSHYE +QLTVEVEQMENMVHVLQK*LSEAKETQLQLAPQKGECEQERYSSSEEQNDTRKQLSKEQN +ARILQDEILTTKQKQIEVAEKKMNFEISLSHKEEKELLHENSMMQEEIAMLRIELDTIKH +QNQLREKKYLEYIKSVKEKNDNLLKAIQLNEEALTKAVVQYSGQLSILTTENKMLSFELQ +NVRHNNETLEMEIQSCHFRLATALHDCD +>ENSP00000477333.1 pep chromosome:GRCh38:9:36002909:36003867:-1 gene:ENSG00000243641.3 transcript:ENST00000424348.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR13C7 description:olfactory receptor family 13 subfamily C member 7 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15102] +MVSANQTASVTEFILLGLSAHPKLEKTFFVLILLMYLVILLGNGVLILMTVSNSHLHMPM +YFFLGNLSFLDICYTTSSVPLILDSFLTPRKTISFSACAVQMFLSFAMGATECVLLSMMA +FDRYVAICNPLRYPVVMSKAAYMPIRLPAPG*LEALLPWCRHPLQ*GCPSVETTSSTTSP +VRFWLS*SWPVLISLSM*SVWE*PM*SSWGSRFCSSLSPMSSSLPPS*GSPQLRGGKRPS +PPALPTSQSWSSSMGPSSSCMGSPSLRTRWGQTSKTLQTNSFPFSMGW*PPCSTPSSTA* +GTRM*RLL*GT*YFRNALP +>ENSP00000426627.1 pep chromosome:GRCh38:1:161581340:161599803:1 gene:ENSG00000244682.7 
transcript:ENST00000466542.6 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:FCGR2C description:Fc fragment of IgG receptor IIc (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15626] +MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAAPPKAVLKLEP*WIN +VLQEDSVTLTCRGTHSPESDSIPWFHNGNLIPTHTQPSYRFKANNNDSGEYTCQTGQTSL +SDPVHLTVLSEWLVLQTPHLEFQEGETIVLRCHSWKDKPLVKVTFFQNGKSKKFSRSDPN +FSIPQANHSHSGDYHCTGNIGYTLYSSKPVTITVQAPSSSPMGIIVAVVTGIAVAAIVAA +VVALIYCRKKRISANSTDPVKAAQFEPPGRQMIAIRKRQPEETNNDYETADGGYMTLNPR +APTDDDKNIYLTLPPNDHVNSNN +>ENSP00000480953.1 pep chromosome:GRCh38:1:161581436:161599828:1 gene:ENSG00000244682.7 transcript:ENST00000611236.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:FCGR2C description:Fc fragment of IgG receptor IIc (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15626] +MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAAPPKAVLKLEPWINV +LQEDSVTLTCRGTHSPESDSIPWFHNGNLIPTHTQPSYRFKANNNDSGEYTCQTGQTSLS +DPVHLTVLSEWLVLQTPHLEFQEGETIVLRCHSWKDKPLVKVTFFQNGKSKKFSRSDPNF +SIPQANHSHSGDYHCTGNIGYTLYSSKPVTITVQAPSSSPMGIIVAVVTGIAVAAIVAAV +VALIYCRKKRISATWTSNDCHQKETT +>ENSP00000444663.2 pep chromosome:GRCh38:1:161581339:161600242:1 gene:ENSG00000244682.7 transcript:ENST00000543859.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:FCGR2C description:Fc fragment of IgG receptor IIc (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15626] +MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAAPPKAVLKLEPWINV +LQEDSVTLTCRGTHSPESDSIPWFHNGNLIPTHTQPSYRFKANNNDSGEYTCQTGQTSLS +DPVHLTVLSEWLVLQTPHLEFQEGETIVLRCHSWKDKPLVKVTFFQNGKSKKFSRSDPNF +SIPQANHSHSGDYHCTGNIGYTLYSSKPVTITVQAPSSSPMGIIVAVVTGIAVAAIVAAV +VALIYCRKKRISANSTDPVKAAQFEPPGRQMIAIRKRQPEETNNDYETADGGYMTLNPRA +PTDDDKNIYLTLPPNDHVNSNN +>ENSP00000427945.1 pep chromosome:GRCh38:8:141433829:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000523857.5 gene_biotype:polymorphic_pseudogene transcript_biotype:nonsense_mediated_decay gene_symbol:MROH5 
description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976] +MDRQCSERPYSCTPTGRVSSAVSQNSSHRLQDAAGHEQC +>ENSP00000429433.1 pep chromosome:GRCh38:8:141433829:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000521053.5 gene_biotype:polymorphic_pseudogene transcript_biotype:nonsense_mediated_decay gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976] +MDRQCSERPYSCTPTGRVSSAVSQNSSHRLQDAAGHEQC +>ENSP00000431031.1 pep chromosome:GRCh38:8:141433832:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000430863.5 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976] +MDRQCSERPYSCTPTGRVSSAVSQNSRISPPVSTSMKDSSCMKVHQDSARRDRWSHPTTI +LLHKSQSSQATLMLQEHRMFMGEAYSAATGFKMLQDMNSADPFHLKYIIKKIKNMAHGSP +KLVMETIHDYFIDNPEISSRHKFRLFQTLEMVIGASDVLEETWEKTFTRLALENMTKATE +LEDIYQDAASNMLVAICRHSWRVVAQHLETELLTGVFPHRSLLYVMGVLSSSEELFSQED +KACWEEQLIQMAIKSVPFLSTDVWSKELLWTLTTPSWTQQEQSPEKAFLFTYYGLILQAE +KNGATVRRHLQALLETSHQWPKQREGMALTLGLAATRHLDDVWAVLDQFGRSRPIRWSLP +SSSPKNSEDLRWKWASSTILLAYGQVAAKARAHILPWVDNIVSRMVFYFHYSSWDETLKQ +SFLTATLMLMGAVSRSEGAHSYEFFQTSELLQCLMVLMEKEPQDTLCTRSRQQAMHIASS +LCKLRPPIDLERKSQLLSTCFRSVFALPLLDALEKHTCLFLEPPNIQLWPVARERAGWTH +QGWGPRAVLHCSEHLQSLYSRTMEALDFMLQSLIMQNPTADELHFLLSHLYIWLASEKAH +ERQRAVHSCMILLKFLNHNGYLDPKEDFKRIGQLVGILGMLCQDPDRATQRCSLEGASHL +YQLLMCHKTGEALQAESQAPKELSQAHSDGAPLWNSRDQKATPLGPQEMAKNHIFQLCSF +QVIKDIMQQLTLAELSDLIWTAIDGLGSTSPFRVQAASEMLLTAVQEHGAKLEIVSSMAQ +AIRLRLCSVHIPQAKEKTLHAITLLARSHTCELVATFLNISIPLDSHTFQLWRALGAGQP +TSHLVLTTLLACLQERPLPTGASDSSPCPKEKTYLRLLAAMNMLHELQFAREFKQAVQEG +YPKLFLALLTQMHYVLELNLPSEP*PKQQAQEAAVPSPQSCSTSLEALKSLLSTTGHWHD +FAHLELQGSWELFTTIHTYPKGVGLLARAMVQNHCRQIPAVLRQLLPSLQSPQERERKVA +ILILTKFLYSPVLLEVLPKQAALTVLAQGLHDPSPEVRVLSLQGLSNILFHPDKGSLLQG 
+QLRPLLDGFFQSSDQVIVCIMGTVSDTLHRLGAQGTGSQSLGVAISTRSFFNDERDGIRA +AAMALFGDLVAAMADRELSGLRTQVHQSMVPLLLHLKDQCPAVATQAKFTFYRCAVLLRW +RLLHTLFCTLAWERGLSARHFLWTCLMTRSQEEFSIHLSQALSYLHSHSCHIKTWVTLFI +GHTICYHPQAVFQMLNAVDTNLLFRTFEHLRSDPEPSIREFATSQLSFLQKVSARPKQ +>ENSP00000429440.1 pep chromosome:GRCh38:8:141494911:141496759:-1 gene:ENSG00000226807.6 transcript:ENST00000521161.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976] +MKDSSCMKVHQDSARRDRWSHPTTILLHKSQSSQATLMLQEHRMFMGEAYSAATGFKMLQ +DMNSADPFHLKYIIKKIKNMAHGSPKLVMETIHDYFIDNPEISSRHKFRL +>ENSP00000481783.1 pep chromosome:GRCh38:8:141433829:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000621837.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976] +MDRQCSERPYSCTPTGRVSSAVSQNSRISPPVSTSMKDSSCMKVHQDSARRDRWSHPTTI +LLHKSQSSQATLMLQEHRMFMGEAYSAATGFKMLQDMNSADPFHLKYIIKKIKNMAHGSP +KLVMETIHDYFIDNPEISSRHKFRLFQTLEMVIGASDVLEETWEKTFTRLALENMTKATE +LEDIYQDAASNMLVAICRHSWRVVAQHLETELLTGVFPHRSLLYVMGVLSSSEELFSQED +KACWEEQLIQMAIKSVPFLSTDVWSKELLWTLTTPSWTQQEQSPEKAFLFTYYGLILQAE +KNGATVRRHLQALLETSHQWPKQREGMALTLGLAATRHLDDVWAVLDQFGRSRPIRWSLP +SSSPKNSEDLRWKWASSTILLAYGQVAAKARAHILPWVDNIVSRMVFYFHYSSWDETLKQ +SFLTATLMLMGAVSRSEGAHSYEFFQTSELLQCLMVLMEKEPQDTLCTRSRQQAMHIASS +LCKLRPPIDLERKSQLLSTCFRSVFALPLLDALEKHTCLFLEPPNIQLWPVARERAGWTH +QGWGPRAVLHCSEHLQSLYSRTMEALDFMLQSLIMQNPTADELHFLLSHLYIWLASEKAH +ERQRAVHSCMILLKFLNHNGYLDPKEDFKRIGQLVGILGMLCQDPDRATQRCSLEGASHL +YQLLMCHKTGEALQAESQAPKELSQAHSDGAPLWNSRDQKATPLGPQEMAKNHIFQLCSF +QVIKDIMQQLTLAELSDLIWTAIDGLGSTSPFRVQAASEMLLTAVQEHGAKLEIVSSMAQ +AIRLRLCSVHIPQAKEKTLHAITLLARSHTCELVATFLNISIPLDSHTFQLWRALGAGQP +TSHLVLTTLLACLQERPLPTGASDSSPCPKEKTYLRLLAAMNMLHELQFAREFKQAVQEG +YPKLFLALLTQMHYVLELNLPSEPPKQQAQEAAVPSPQSCSTSLEALKSLLSTTGHWHDF 
+AHLELQGSWELFTTIHTYPKGVGLLARAMVQNHCRQIPAVLRQLLPSLQSPQERERKVAI +LILTKFLYSPVLLEVLPKQAALTVLAQGLHDPSPEVRVLSLQGLSNILFHPDKGSLLQGQ +LRPLLDGFFQSSDQVIVCIMGTVSDTLHRLGAQGTGSQSLGVAISTRSFFNDERDGIRAA +AMALFGDLVAAMADRELSGLRTQVHQSMVPLLLHLKDQCPAVATQAKFTFYRCAVLLRWR +LLHTLFCTLAWERGLSARHFLWTCLMTRSQEEFSIHLSQALSYLHSHSCHIKTWVTLFIG +HTICYHPQAVFQMLNAVDTNLLFRTFEHLRSDPEPSIREFATSQLSFLQKVSARPKQ +>ENSP00000485975.1 pep chromosome:GRCh38:CHR_HG142_HG150_NOVEL_TEST:56210873:56211820:-1 gene:ENSG00000263150.3 transcript:ENST00000570683.3 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR8J2 description:olfactory receptor family 8 subfamily J member 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15311] +MASGNLTWVTEFILVGVSDDPELQIPLFLVFLVLYLLTVAGNLGIITLTSVDPQLQTPMY +FFL*HLAIINLCNSTVVAPKMLVNFLVTKKTISYYGCAAQLGGFLVFIVAEIFTLAAMAY +DRYVAIWSPLLYAVVVSPKVCRLLVSLTYLQSLITALTVSSCVFSVSYCSSNIINHFYCD +DVPLLALSCSDTYIPETAVFIFSGTNLLFSMIVVLISYFNIVITILRIRSSEGRQKAFST +CASHMIAVVVFYGTLLFMYLQPRSNHSLDTDKMASVFYTLVIPVLNPLIYSLRNKNVKDA +LKRFLDNPCRSLKLM +>ENSP00000460880.1 pep chromosome:GRCh38:CHR_HG142_HG150_NOVEL_TEST:56318307:56319245:1 gene:ENSG00000262755.1 transcript:ENST00000573400.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR8K3 description:olfactory receptor family 8 subfamily K member 3 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15313] +MEQHNLTTVNEFILTGITDIAELQAPLFALFLMIYVISVMGNLGMIVLTKLDSRLQTPMY +FFLRHLAFMDLGYSTTVGPKMLVNFVVDKNIISYYFCATQLAFFLVFIGSELFILSAMSY +DLYVAICNPLLYTVIMSRRVCQVLVAIPYLYCTFISLLVTIKIFTLSFCGYNVISHFYCD +SLPLLPLLCSNTHEIELIILIFAAIDLISSLLIVLLSYLLILVAILRMNSAGRQKAFSTC +GAHLTVVIVFYGTLLFMYVQPKSSHSFDTDKVASIFYTLVIPMLNPLIYSLRNKDVKYAL +RRTWNNLCNIFV +>ENSP00000468117.2 pep chromosome:GRCh38:10:116621306:116645143:1 gene:ENSG00000266200.6 transcript:ENST00000591655.3 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PNLIPRP2 description:pancreatic lipase related protein 2 
(gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:9157] +MLPPWTLGLLLLATVRGKEVCYGQLGCFSDEKPWAGTLQRPVKLLPWSPEDIDTRFLLYT +NENPNNFQLITGTEPDTIEASNFQLDRKTRFIIHGFLDKAEDSWPSDMCKKMFEVEKVNC +ICVDWRHGSRAMYTQAVQNIRVVGAETAFLIQALSTQLGYSLEDVHVIGHSLGAHTAAEA +GRRLGGRVGRITGLDPAGPCFQDEPEEVRLDPSDAVFVDVIHTDSSPIVPSLGFGMSQKV +GHLDFFPNGGKEMPGCKKNVLSTITDIDGIWEGIGGFVSCNHLRSFEYYSSSVLNPDGFL +GYPCASYDEFQESKCFPCPAEGCPKMGHYADQFKGKTSAVEQTFFLNTGESGNFTSWRYK +ISVTLSGKEKVNGYIRIALYGSNENSKQYEIFKGSLKPDASHTCAIDVDFNVGKIQKVKF +LWNKRGINLSEPKLGASQITVQSGEDGTEYNFCSSDTVEENVLQSLYPC +>ENSP00000480815.1 pep chromosome:GRCh38:10:116621306:116645097:1 gene:ENSG00000266200.6 transcript:ENST00000611850.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PNLIPRP2 description:pancreatic lipase related protein 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:9157] +MLPPWTLGLLLLATVRGKEVCYGQLGCFSDEKPWAGTLQRPVKLLPWSPEDIDTRFLLYT +NENPNNFQLITGTEPDTIEASNFQLDRKTRFIIHGFLDKAEDSWPSDMCKKMFEVEKVNC +ICVDWRHGSRAMYTQAVQNIRVVGAETAFLIQALSTQLGYSLEDVHVIGHSLGAHTAAEA +GRRLGGRVGRITGLDPAGPCFQDEPEEVRLDPSDAVFVDVIHTDSSPIVPSLGFGMSQKV +GHLDFFPNGGKEMPGCKKNVLSTITDIDGIWEGIGGFVSCNHLRSFEYYSSSVLNPDGFL +GYPCASYDEFQESKCFPCPAEGCPKMGHYADQFKGKTSAVEQTFFLNTGESGNFTSRYKI +SVTLSGKEKVNGYIRIALYGSNENSKQYEIFKGSLKPDASHTCAIDVDFNVGKIQKVKFL +WNKRGINLSEPKLGASQITVQSGEDGTEYNFCSSDTVEENVLQSLYPC +>ENSP00000463502.4 pep chromosome:GRCh38:10:116620953:116645143:1 gene:ENSG00000266200.6 transcript:ENST00000579578.6 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:PNLIPRP2 description:pancreatic lipase related protein 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:9157] +MLPPWTLGLLLLATVRGKEVCYGQLGCFSDEKPWAGTLQRPVKLLPWSPEDIDTRFLLYT +NENPNNFQLITGTEPDTIEASNFQLDRKTRFIIHGFLDKAEDSWPSDMCKKMFEVEKVNC +ICVDWRHGSRAMYTQAVQNIRVVGAETAFLIQALSTQLGYSLEDVHVIGHSLGAHTAAEA +GRRLGGRVGRITGLDPAGPCFQDEPEEVRLDPSDAVFVDVIHTDSSPIVPSLGFGMSQKV +GHLDFFPNGGKEMPGCKKNVLSTITDIDGIWEGIGGFVSCNHLRSFEYYSSSVLNPDGFL 
+GYPCASYDEFQESKCFPCPAEGCPKMGHYADQFKGKTSAVEQTFFLNTGESGNFTS*RYK +ISVTLSGKEKVNGYIRIALYGSNENSKQYEIFKGSLKPDASHTCAIDVDFNVGKIQKVKF +LWNKRGINLSEPKLGASQITVQSGEDGTEYNFCSSDTVEENVLQSLYPC +>ENSP00000485881.1 pep chromosome:GRCh38:CHR_HG151_NOVEL_TEST:56830655:56831590:-1 gene:ENSG00000281107.1 transcript:ENST00000631283.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR5G3 description:olfactory receptor family 5 subfamily G member 3 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15287] +MEDKNQTVVTEFLLLGLTDHPYQKIVLFFMFLFVYLITLGGNLGMITLIWIDPRLHTPMY +FFLRHLSFVDICSSSSVVPKMLCNIFAEKKDITFLGCAAQMWFFGLFEAAECFLWLPWHM +TGMWPSASPCCIRSLCLSRSVCSWW*GLMPWLL*AP*LIQFSLFAYPFVVQILSITFSVI +FFHCFP*HVQTPG*INLCCLSWLEL*EYSVV*SSWSPIFAS**PS*RSRLLMGSKKLSSP +VFLTLRLSPSCMGLFS*FMFGQVQVPPWVSIK*FLYFILW*SPWLTPLFTA*GIRR*KMH +SEEKLRGKNLL +>ENSP00000407375.1 pep chromosome:GRCh38:3:49357176:49358353:-1 gene:ENSG00000233276.5 transcript:ENST00000419783.3 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN +ELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGA +GAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYS +RRFQTIDIEPDIEALLSQGPSCA +>ENSP00000391316.1 pep chromosome:GRCh38:3:49357178:49358312:-1 gene:ENSG00000233276.5 transcript:ENST00000419349.2 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN +ELQRRLGPRGLVVLGFPCNQFGHQVRRAERGGAGADVQ +>ENSP00000495108.1 pep chromosome:GRCh38:3:49357201:49358325:-1 gene:ENSG00000233276.5 transcript:ENST00000643797.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] 
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFE +PNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEK +FLVGPDGVPLRRYSRRFQTIDIEPDIEALLSQGPSCA +>ENSP00000495001.1 pep chromosome:GRCh38:3:49357477:49358325:-1 gene:ENSG00000233276.5 transcript:ENST00000646881.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASL*GTTVRDYTQMN +ELQRRLGPRGLVVLGFPCNQFGHQLPFSPVGERQERRDSEFPQVRPAWWWVRAQLHALRE +VRGERCGGAPSLRLPAGGPASSQRRRHRAYDRPQAHHLVSGVSQRCCLEL +>ENSP00000493593.1 pep chromosome:GRCh38:3:49357608:49358325:-1 gene:ENSG00000233276.5 transcript:ENST00000496791.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASL*GTTVRDYTQMN +ELQRRLGPRGLVVLGFPCNQFGHQLLKFLSALGIAHGGQNPGDS +>ENSP00000498820.1 pep chromosome:GRCh38:3:49357506:49358278:-1 gene:ENSG00000233276.5 transcript:ENST00000651740.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN +ELQRRLGPRGLVVLGFPCNQFGHQLPFSPVGERQERRDSEFPQVRPAWWWVRAQLHALRE +VRGERCGGAPSLRLPAGGPASSQRRRHRAYDRPQAHHLVSGVSQRCCLEL +>ENSP00000499000.1 pep chromosome:GRCh38:3:49357826:49358278:-1 gene:ENSG00000233276.5 transcript:ENST00000651279.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN +ELQRRLGPRGLVVLGFPCNQFGHQLLKFLSALGIAHGGQNPGDS +>ENSP00000478837.1 pep chromosome:GRCh38:3:49357180:49358358:-1 gene:ENSG00000233276.5 transcript:ENST00000620890.1 
gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553] +MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLGTTVRDYTQMNE +LQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAG +AHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSR +RFQTIDIEPDIEALLSQGPSCA +>ENSP00000486888.2 pep chromosome:GRCh38:14:94364313:94366698:-1 gene:ENSG00000258597.3 transcript:ENST00000553483.3 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:SERPINA2 description:serpin family A member 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:8985] +IPFSVSWGVLLLAGLCCLVPSSLVEDPQEDAAQKTDTSHHDQGDWEDLACQKISYNVTDL +AFDLYKELADLSQTSNVLVTPTSVAMAFAMLSLGTKADTRTEILEGLNVNLTETPEAKIH +ECFQQVLQALSRPDTRLQLTTGSSLFVNKSMKLVDTFLEDTKKLYHSEASSINFRDTEEA +KEQINNYVEKRTGRKVVDLVKHLKKDTSLALVDYISFHGKWKDKFKAERIMVEGFHVDDK +TIIRVPMINHLGRFDIHRDRELSSWVLAQHYVGNATAFFILPDPKKMWQLEEKLTYSHLE +NIQRAFDIR* +>ENSP00000486005.1 pep chromosome:GRCh38:CHR_HSCHR17_2_CTG2:1201029:1203765:1 gene:ENSG00000280938.1 transcript:ENST00000626647.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:TRARG1 description:trafficking regulator of GLUT4 (SLC2A4) 1 [Source:HGNC Symbol;Acc:HGNC:29592] +XQKK +>ENSP00000434644.1 pep chromosome:GRCh38:16:81100889:81181324:-1 gene:ENSG00000166473.17 transcript:ENST00000533478.5 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715] +MGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRG +EVIRIRATALTRHAYGEDTYVISTVPPREVPACTIAPEEGTVLTSFAIFCNASTALGPLE +FCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVLTVVISATNRAGDTQQTQAMAKVALG +DTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAKAVSSMLNQEHESQGSGQSLSIDVRQ +KVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTCRSKELTPSAQWEASLALQHASEALL +TVSAKARPEDQRRQAATRDLFQAVGSVLEASLSNRPEEPAEASSSQIATVLRLLRVMEHV 
+QTTLLLGKLPGGLPAMLATPSISVYTNRIQPWSWQGSSLRPDAADSATFMLPAASSLSSL +EGGQEPVDIKIMSFPKSPFPARSHFDVSGTVGGLRVTSPSGQLIPVKNLSENIEILLPRH +SQRHSQPTVLNLTSPEALWVNVTSGEATLGIQLHWRPDIALTLSLGYGYHPNKSSYDAQT +HLVPMVAPDELPTWILSPQDLRFGEGVYYLTVVPESDLEPAPGRDLTVGITTFLSHCVFW +DEVQETWDDSGCQVGPRTSPYQTHCLCNHLTFFGSTFLVMSNAINIHQTAELFATFEDNP +VVVTTVGCLCVVYVLVVIWARRKDAQDQAKVKVTVLEDNDPFAQYHYLVTVYTGHRRGAA +TSSKVTVTLYGLDGEREPHHLADPDTPVFERGAVDAFLLSTLFPLGELRSLRLWHDNSGD +RPSWYVSRVLVYDLVMDRKWYFLCNSWLSINVGDCVLDKVFPVATEQDRKQFSHLFFMKT +SAGFQDGHIWYSIFSRCARSSFTRVQRVSCCFSLLLCTMLTSIMFWGVPKDPAEQKMDLG +KIEFTWQEVMIGLESSILMFPINLLIVQIFQNTRPRVAKEQNTGKWDRGSPNLTPSPQPM +EDGLLTPEAVTKDVSRIVSSLFKALKVPSPALGWDSVNLMDINSLLALVEDVIYPQNTSG +QVFWEEAKKREDPVTLTLGSSEMKEKSQCPKPKAARSGPWKDSAYRQCLYLQLEHVEQEL +RLVGPRGFSQPHSHAQALRQLQTLKGGLGVQPGTWAPAHASALQVSKPPQGLPWWCILVG +WLLVAATSGVAAFFTMLYGLHYGRASSLRWLISMAVSFVESMFVTQPLKVLGFAAFFALV +LKRVDDEEDTVAPLPGHLLGPDPYALFRARRNSSRDVYQPPLTAAIEKMKTTHLKEQKAF +ALIREILAYLGFLWMLLLVAYGQRDPSAYHLNRHLQHSFTRGFSGVLGFREFFKWANTTL +VSNLYGHPPGFITDGNSKLVGSAQIRQVRVQESSCPLAQQPQAYLNGCRAPYSLDAEDMA +DYGEGWNATTLSNGSTRARTNVKGIPSGANSLCTGEEATWSPWGLIAKARQEFSAISLTT +PGWTP*PELCLWSPLSTTPTSTCSALSR*R*RPALWAPFLRTRPCRASACTPSPTAGTPS +W*RQSSSTSSSSSTTWWCRASA*VKRRGAISAASGTFWSWPSSWPAGAPWRCL*RGLSWP +KGTSSAAGTTGRKASASVRQQQPMPPLATSLPSWYSCPQ*SFGICSG*IPK*T*SRQPYA +VPGATFQAL*LSSLPCSWLTPSRQT*YLVGNSVPTKPSLMRRRRWSAFSWESSTTRRSWT +IAQCLAPSSLDPALFL*HLWC*TCLSLSSWWPSVRSKNTISCRRKGRS*ICC**KYSVSW +ALSLRERSLEAAGSSLGLCPRLATLDQHKLCPRT +>ENSP00000434417.1 pep chromosome:GRCh38:16:81101123:81220370:-1 gene:ENSG00000166473.17 transcript:ENST00000525539.5 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715] +MSAVGLVLLVLALRLRATTVKPEEGSFCSNSQVAFRDACYEFVPLGRTFRDAQSWCEGQG +GHLVFIQDEGTQWFLQKHISQDREWWIGLTWNLARNGTTEGPGTWLDTSNVTYSNWHGGQ +AAAAPDTCGHIGRGPSSEWVTSDCAQTFAFMCEFRVGQSLACEGLNATVHCGLGQVIQVQ 
+DAVYGRQNPHFCTQDAGRPSDLEQGCSWANVKEEVAGQCQELQSCQVAADETYFGNLCPT +QGSYLWVQYQCREALQLMVSSESFIFDNVTISLTWLLSPYIGNLSCIISTGDSHTFDPYN +PPSVSSNVTHQFTSPGEFTVFAECTTSEWHVTAQRQVTVRDKMETLSVTACSGLSQSGAG +PLCQAVFGDPLWIQVELDGGTGVTYTVLLGDITLAESTTQKGSLPYNLILDRETQKLMGP +GRHRLEIQATGNTTTSTISRNITVHLVELLSGLQASWASDHLELGQDLLITISLAQGTPE +ELTFEVAGLNATFSHEQVSFGEPFGICRLAVPVEGTFLVTMLVRNAFSNLSLEIGNITIT +APSGLQEPSGMNAEGKSKDKGDMEVYIQPGPYVDPFTTVTLGWPDNDKELRFQWSCGSCW +ALWSSCVERQLLRTDQRELVVPASCLPPPDSAVTLRLAVLRGQELENRAEQCLYVSAPWE +LRPRVSCERNCRPVNASKDILLRVTMGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFW +PRSLTLLQSNTSTLLLNSSFLQSRGEVIRIRATALTRHAYGEDTYVISTVPPREVPACTI +APEEGTVLTSFAIFCNASTALGPLEFCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVL +TVVISATNRAGDTQQTQAMAKVALGDTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAK +AVSSMLNQEHESQGSGQSLSIDVRQKVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTC +RSKELTPSAQWEASLALQHASEALLTVSAKARPEDQRRQAATRDLFQAVGSVLEASLSNR +PEEPAEASSSQIATVLRLLRVMEHVQTTLLLGKLPGGLPAMLATPSISVYTNRIQPWSWQ +GSSLRPDAADSATFMLPAASSLSSLEGGQEPVDIKIMSFPKSPFPARSHFDVSGTVGGLR +VTSPSGQLIPVKNLSENIEILLPRHSQRHSQPTVLNLTSPEALWVNVTSGEATLGIQLHW +RPDIALTLSLGYGYHPNKSSYDAQTHLVPMVAPDELPTWILSPQDLRFGEGVYYLTVVPE +SDLEPAPGRDLTVGITTFLSHCVFWDEVQETWDDSGCQVGPRTSPYQTHCLCNHLTFFGS +TFLVMSNAINIHQTAELFATFEDNPVVVTTVGCLCVVYVLVVIWARRKDAQDQAKVKVTV +LEDNDPFAQYHYLVTVYTGHRRGAATSSKVTVTLYGLDGEREPHHLADPDTPVFERGAVD +AFLLSTLFPLGELRSLRLWHDNSGDRPSWYVSRVLVYDLVMDRKWYFLCNSWLSINVGDC +VLDKVFPVATEQDRKQFSHLFFMKTSAGFQDGHIWYSIFSRCARSSFTRVQRVSCCFSLL +LCTMLTSIMFWGVPKDPAEQKMDLGKIEFTWQEVMIGLESSILMFPINLLIVQIFQNTRP +RVAKEQNTGKWDRGSPNLTPSPQPMEDGLLTPEAVTKDVSRIVSSLFKALKVPSPALGWD +SVNLMDINSLLALVEDVIYPQNTSGQVFWEEAKKREDPVTLTLGSSEMKEKSQCPKPKAA +RSGPWKDSAYRQCLYLQLEHVEQELRLVGPRGFSQPHSHAQALRQLQTLKGGLGVQPGTW +APAHASALQVSKPPQGLPWWCILVGWLLVAATSGVAAFFTMLYGLHYGRASSLRWLISMA +VSFVESMFVTQPLKVLGFAAFFALVLKRVDDEEDTVAPLPGHLLGPDPYALFRARRNSSR +DVYQPPLTAAIEKMKTTHLKEQKAFALIREILAYLGFLWMLLLVAYGQRDPSAYHLNRHL +QHSFTRGFSGVLGFREFFKWANTTLVSNLYGHPPGFITDGNSKLVGSAQIRQVRVQESSC +PLAQQPQAYLNGCRAPYSLDAEDMADYGEGWNATTLSNGSTRARTNVKGIPSGANSLCTG 
+EEATWSPWGLIAKARQEFSAISLTTPGWTP*PELCLWSPLSTTPTSTCSALSR*R*RPAL +WAPFLRTRPCRASACTPSPTAGTPSW*RQSSSTSSSSSTTWWCRASA*VKRRGAISAASG +TFWSWPSSWPAGAPWRCL*RGLSWPKGTSSAAGTTGRKASASVRQQQPMPPLATSLPSWY +SCPQ*SFGICSG*IPK*T*SRQPYAVPGATFQAL*LSSLPCSWLTPSRQT*YLVGNSVPT +KPSLMRRRRWSAFSWESSTTRRSWTIAQCLAPSSLDPALFL*HLWC*TCLSLSSWWPSVR +SKNTISCRRKGRS*ICC**KYSVSWALSLRERSLEAAGSSLGLCPRLATLDQHKLCPRT +>ENSP00000436309.1 pep chromosome:GRCh38:16:81170289:81181329:-1 gene:ENSG00000166473.17 transcript:ENST00000531391.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715] +MGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRG +EVIRIRATALTRHAYGEDTYVISTVPPREVPACTIAPEEGTVLTSFAIFCNASTALGPLE +FCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVLTVVISATNRAGDTQQTQAMAKVALG +DTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAKAVSSMLNQEHESQGSGQSLSIDVRQ +KVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTCRSKELTPSAQGSCMGDSWEGAPPAA +HVSHAR +>ENSP00000436389.1 pep chromosome:GRCh38:16:81170292:81198786:-1 gene:ENSG00000166473.17 transcript:ENST00000526632.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715] +XLAQGTPEELTFEVAGLNATFSHEQVSFGEPFGICRLAVPVEGTFLVTMLVRNAFSNLSL +EIGNITITAPSGLQEPSGMNAEGKSKDKGDMEVYIQPGPYVDPFTTVTLGWPDNDKELRF +QWSCGSCWALWSSCVERQLLRTDQRELVVPASCLPPPDSAVTLRLAVLRGQELENRAEQC +LYVSAPWELRPRVSCERNCRPVNASKDILLRVTMGEDSPVAMFSWYLDNTPTEQAEPLLD +ACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRGEVIRIRATALTRHAYGEDTYVISTVPP +REVPACTIAPEEGTVLTSFAIFCNASTALGPLEFCFCLESGSCLHCGPEPALPSVYLPLG +EENNDFVLTVVISATNRAGDTQQTQAMAKVALGDTCVEDVAFQAAVSEKIPTALQGEGGP +EQLLQLAKAVSSMLNQEHESQGSGQSLSIDVRQKVREHVLGSLSAVTTGLEDVQRVQELA +EVLREVTCRSKELTPSAQGSCMGDSWEGAPPAAHVSHAR +>ENSP00000432818.1 pep chromosome:GRCh38:16:81171215:81181327:-1 gene:ENSG00000166473.17 transcript:ENST00000527937.1 gene_biotype:polymorphic_pseudogene 
transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715] +MGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRG +EVIRIRATGSCLHCGPEPALPSVYLPLGEENNDFVLTVVISATNRAGDTQQTQAMAKVAL +GDTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAKAVSSMLNQEHESQGSGQSLSIDVR +QKVPVGSWGAPFIPFLWGPRVCVRPFGLWIKVHGSGEKPVVSPKRLTPPPSLVFWVSDIK +>ENSP00000337397.4 pep chromosome:GRCh38:16:81170289:81220370:-1 gene:ENSG00000166473.17 transcript:ENST00000337114.8 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715] +MSAVGLVLLVLALRLRATTVKPEEGSFCSNSQVAFRDACYEFVPLGRTFRDAQSWCEGQG +GHLVFIQDEGTQWFLQKHISQDREWWIGLTWNLARNGTTEGPGTWLDTSNVTYSNWHGGQ +AAAAPDTCGHIGRGPSSEWVTSDCAQTFAFMCEFRVGQSLACEGLNATVHCGLGQVIQVQ +DAVYGRQNPHFCTQDAGRPSDLEQGCSWANVKEEVAGQCQELQSCQVAADETYFGNLCPT +QGSYLWVQYQCREALQLMVSSESFIFDNVTISLTWLLSPYIGNLSCIISTGDSHTFDPYN +PPSVSSNVTHQFTSPGEFTVFAECTTSEWHVTAQRQVTVRDKMETLSVTACSGLSQSGAG +PLCQAVFGDPLWIQVELDGGTGVTYTVLLGDITLAESTTQKGSLPYNLILDRETQKLMGP +GRHRLEIQATGNTTTSTISRNITVHLVELLSGLQASWASDHLELGQDLLITISLAQGTPE +ELTFEVAGLNATFSHEQVSFGEPFGICRLAVPVEGTFLVTMLVRNAFSNLSLEIGNITIT +APSGLQEPSGMNAEGKSKDKGDMEVYIQPGPYVDPFTTVTLGWPDNDKELRFQWSCGSCW +ALWSSCVERQLLRTDQRELVVPASCLPPPDSAVTLRLAVLRGQELENRAEQCLYVSAPWE +LRPRVSCERNCRPVNASKDILLRVTMGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFW +PRSLTLLQSNTSTLLLNSSFLQSRGEVIRIRATALTRHAYGEDTYVISTVPPREVPACTI +APEEGTVLTSFAIFCNASTALGPLEFCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVL +TVVISATNRAGDTQQTQAMAKVALGDTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAK +AVSSMLNQEHESQGSGQSLSIDVRQKVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTC +RSKELTPSAQGSCMGDSWEGAPPAAHVSHAR +>ENSP00000385765.6 pep chromosome:GRCh38:22:23980058:23983710:1 gene:ENSG00000099984.11 transcript:ENST00000402588.6 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GSTT2 description:glutathione S-transferase theta 2 (gene/pseudogene) [Source:HGNC 
Symbol;Acc:HGNC:4642] +MGLELFLDLVSQPSRAVYIFAKKNGIPLELRTVDLVKGQHKSKEFLQINSLGKLPTLKDG +DFILTESSAILIYLSCKYQTPDHWYPSDLQARARVHEYLGWHADCIRGTFGIPLWVQMLG +PLIGVQVPEEKVERNRTAMDQALQWLEDKFLGDRPFLAGQQPVALGYELFEGRPRLAAWR +G*VEAFLGAELCQEAHSIILSILEQAAKKTLPTPSPEAYQAMLLRIARIP +>ENSP00000488993.1 pep chromosome:GRCh38:22:23980058:23983915:1 gene:ENSG00000099984.11 transcript:ENST00000634759.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GSTT2 description:glutathione S-transferase theta 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:4642] +MGLELFLDLVSQPSRAVYIFAKKNGIPLELRTVDLVKGQHKSKEFLQINSLGKLPTLKDG +DFILTESSAILIYLSCKYQTPDHWYPSDLQARARVHEYLGWHADCIRGTFGIPLWVQMLG +PLIGVQVPEEKVERNRTAMDQALQWLEDKFLGDRPFLAGQQVTLADLMALEELMQPVALG +YELFEGRPRLAAWRG*VEAFLGAELCQEAHSIILSILEQAAKKTLPTPSPEAYQAMLLRI +ARIP +>ENSP00000481555.1 pep chromosome:GRCh38:22:23980123:23983911:1 gene:ENSG00000099984.11 transcript:ENST00000621118.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GSTT2 description:glutathione S-transferase theta 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:4642] +MGLELFLDLVSQPSRAVYIFAKKNGIPLELRTVDLVKGQHKSKEFLQINSLGKLPTLKDG +DFILTESSAILIYLSCKYQTPDHWYPSDLQARARVHEYLGWHADCIRGTFGIPLWVQMLG +PLIGVQVPEEKVERNRTAMDQALQWLEDKFLGDRPFLAGQQVTLADLMALEELMQPVALG +YELFEGRPRLAAWRGVEAFLGAELCQEAHSIILSILEQAAKKTLPTPSPEAYQAMLLRIA +RIP +>ENSP00000493452.1 pep chromosome:GRCh38:1:247770169:247779524:1 gene:ENSG00000228336.2 transcript:ENST00000446393.2 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR9H1P description:olfactory receptor family 9 subfamily H member 1 pseudogene [Source:HGNC Symbol;Acc:HGNC:15038] +MVNFTHVSEFVLLGFQGGPGMQAMLFLIFLILYGIAVVGNLGMIVIIWVDAHLHTPMYAF +LQSLSLLDICYSSTIAPRALANSMQEDHTISFGGCAAQFFFLSLFGITEAFLLAAMAYDR +FIAICNPLLYSVSMSHQVCVLLISGSYLWGVVNAIAQTTMTFRLPFCGSNEINDFFCDVP +PLLSLSCSDTFINQLVLLGLCGSIIVSTFLIVLVSYIYIISTILRIPTMQGR*KAFSTCA 
+SHLTGVCLFFGTVFFMYAQPSAIFFMEQSKIVSIFYTMVIPMLNPLIYSLRNKEVKQALR +RSMQKLSL +>ENSP00000493221.1 pep chromosome:GRCh38:1:248436359:248444316:1 gene:ENSG00000227152.6 transcript:ENST00000641557.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR2T7 description:olfactory receptor family 2 subfamily T member 7 [Source:HGNC Symbol;Acc:HGNC:15019] +MEQSNYSVYADFILLGLFSNARFPWLLCPHSPGLCDLHSQQRGQDHSHPHRLPPPHPHVL +PAQPALPQGHPVYFHHCAQNAGRPGDEPESHFLCWMHCPTLPLLDLSRG*VLPPRTHVL* +SLRSHLQPSALS*PHEPQDLLVDCGGSLAGRVYRWFLAHPRHHAVPLLCLSGDQPLLLRG +ACPSEALLHGHISLRDSHVCLLYYDAPHPFLCDLGLLHKNSHYCL*DERGRGEAKGCGHL +LLTHGGCQPLLWGCHVHIRAASFLPHP*AGQSCICLLHHPHSHAQSTHLQP*EQGCHGGP +TEGCWEVCVLRKGNHFL +>ENSP00000493243.1 pep chromosome:GRCh38:1:248439145:248444316:1 gene:ENSG00000227152.6 transcript:ENST00000641057.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR2T7 description:olfactory receptor family 2 subfamily T member 7 [Source:HGNC Symbol;Acc:HGNC:15019] +MEQSNYSVYADFILLGLFSNARFPWLLCPHSPGLCDLHSQQRGQDHSHPHRLPPPHPHVL +PAQPALPQGHPVYFHHCAQNAGRPGDEPESHFLCWMHCPTLPLLDLSRG*VLPPRTHVL* +SLRSHLQPSALS*PHEPQDLLVDCGGSLAGRVYRWFLAHPRHHAVPLLCLSGDQPLLLRG +ACPSEALLHGHISLRDSHVCLLYYDAPHPFLCDLGLLHKNSHYCL*DERGRGEAKGCGHL +LLTHGGCQPLLWGCHVHIRAASFLPHP*AGQSCICLLHHPHSHAQSTHLQP*EQGCHGGP +TEGCWEVCVLRKGNHFL +>ENSP00000475160.1 pep chromosome:GRCh38:3:98064472:98065396:1 gene:ENSG00000213439.3 transcript:ENST00000429239.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR5AC1 description:olfactory receptor family 5 subfamily AC member 1 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15047] +MAEENKILVTHFVLTGLTDHPGLQAPLFLVFLVIYLITLVGNLGLMALIWKDPHLHTPIY +LFLGSLAFADACTSSSVTSKMLINFFIKESYAIHG*VCHPVLLFWFQCNHRMLPAGSDGL +*PLCSHMQSLALSSGDVQ*PLYSVYRYFIFYWFSAFSDSCGFVI*INFLQVQYYTLFLL* +NFTAVQNFLHQSYS*YTSDFHLFSIYTSLHFYDSYRLLLLYSLCHPEKEV*EG*KQSLLY +LQCPSALCLFVLRHPLLHVCEF*VWISCRSGQNVFFILHNNNSFTKSFYLQPKEQRGYRC 
+PEKNHEEI +>ENSP00000476186.1 pep chromosome:GRCh38:3:98306752:98312843:1 gene:ENSG00000232535.2 transcript:ENST00000394191.3 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR5H8 description:olfactory receptor family 5 subfamily H member 8 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:14773] +MDDENATLLTEFVLTGLTYQSEWKIPLFLAFLVIYLITIMANLGLIAVIWKDSHLHIPMY +LFLGSLAFVDAWLSSSVTPKMLISFLAKSMIISVSECKIQFFSFGISGTTECFLLATMAY +DRYVAICKPLLYPVIMTNGLCIWLLVLSFIGGFLHALIHEGILFRLTFCNSNIIHHFYCD +IIPLLKISCTDPSINFLMLFILSGSIQVFTILTVLVSYTFVLFTILKKKVCQRHKESLFH +LWSPSLICFFILWPPSLHVCAPCISTSR*SRYGGVSILHCHNSFLKSHYLQPEK*ASHRF +TDKNIKRKCL +>ENSP00000492977.1 pep chromosome:GRCh38:17:74560701:74567343:-1 gene:ENSG00000284690.2 transcript:ENST00000641710.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:CD300H description:CD300H molecule (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:52292] +MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQP +CLPIWHEMVETGGSEGVVRSDQVIITDHPGDLTFTVTLENLTADDAGKYRCGIATILQED +GLSGFLPDPFFQVQVLVSSASSTENSVKTPASPTRPSQCQGSLPSSTCFLLLPLLKVPLL +LSILGAILWVNRPWRTPWTES +>ENSP00000492997.1 pep chromosome:GRCh38:17:74560781:74567343:-1 gene:ENSG00000284690.2 transcript:ENST00000641031.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:CD300H description:CD300H molecule (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:52292] +MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQP +CLPIWHEMVETGGSEGVVRSDQVIITDHPGDLTFTVTLENLTADDAGKYRCGIATILQED +GLSGFLPDPFFQVQVLVSSGPCPAAPASCFSHS +>ENSP00000498753.1 pep chromosome:GRCh38:17:74560781:74567343:-1 gene:ENSG00000284690.2 transcript:ENST00000651881.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:CD300H description:CD300H molecule (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:52292] +MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQP 
+CLPIWHEMVETGGSEGVVRSDQVIITDHPGDLTFTVTLENLTADDAGKYRCGIATILQED +GLSGFLPDPFFQVQVLVSSGPCPAAPASCFSHS diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.fa.gz b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.fa.gz new file mode 100644 index 00000000..db7dd483 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.fa.gz differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-11.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-11.bloomfilter.nodegraph new file mode 100644 index 00000000..287c4f91 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-11.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-12.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-12.bloomfilter.nodegraph new file mode 100644 index 00000000..049f2f46 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-12.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-5.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-5.bloomfilter.nodegraph new file mode 100644 index 00000000..b57b3ae0 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-5.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-6.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-6.bloomfilter.nodegraph new file mode 100644 index 00000000..c3869dc3 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-6.bloomfilter.nodegraph differ diff 
--git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-7.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-7.bloomfilter.nodegraph new file mode 100644 index 00000000..61ab111c Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-7.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-15.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-15.bloomfilter.nodegraph new file mode 100644 index 00000000..bf7d5b91 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-15.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-16.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-16.bloomfilter.nodegraph new file mode 100644 index 00000000..7fe2c05f Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-16.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-20.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-20.bloomfilter.nodegraph new file mode 100644 index 00000000..aa542aab Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-20.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-21.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-21.bloomfilter.nodegraph new file mode 100644 index 00000000..c8be607b Binary 
files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-21.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-22.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-22.bloomfilter.nodegraph new file mode 100644 index 00000000..534ac1a7 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-22.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-23.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-23.bloomfilter.nodegraph new file mode 100644 index 00000000..493adf9b Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-23.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-25.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-25.bloomfilter.nodegraph new file mode 100644 index 00000000..39c67711 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-25.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-5.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-5.bloomfilter.nodegraph new file mode 100644 index 00000000..70534626 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-5.bloomfilter.nodegraph differ diff --git 
a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-6.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-6.bloomfilter.nodegraph new file mode 100644 index 00000000..f6cd7951 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-6.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-7.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-7.bloomfilter.nodegraph new file mode 100644 index 00000000..a8d26947 Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-7.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-5.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-5.bloomfilter.nodegraph new file mode 100644 index 00000000..813b953a Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-5.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-6.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-6.bloomfilter.nodegraph new file mode 100644 index 00000000..b3bbebae Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-6.bloomfilter.nodegraph differ diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph new file mode 100644 index 00000000..9ae4ab52 Binary files /dev/null and 
b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph differ diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-11.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-11.csv new file mode 100644 index 00000000..815573df --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-11.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,12.0,Coding +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,13.0,Non-coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.0,12.0,Non-coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.16666666666666666,12.0,Non-coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.125,8.0,Non-coding +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,14.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.07692307692307693,13.0,Non-coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.1,10.0,Non-coding +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.0,12.0,Non-coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,12.0,Non-coding +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.07692307692307693,13.0,Non-coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.08333333333333333,12.0,Non-coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,12.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,12.0,Non-coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 
Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.07142857142857142,14.0,Non-coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,13.0,Non-coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in dayhoff encoding diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-12.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-12.csv new file mode 100644 index 00000000..d85c2549 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-12.csv @@ -0,0 +1,23 @@ +,read_id,jaccard_in_peptide_db,n_kmers,classification +0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,11.0,Coding +1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.08333333333333333,12.0,Non-coding +2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.09090909090909091,11.0,Non-coding +3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.0,11.0,Non-coding +4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.0,7.0,Non-coding +5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,13.0,Coding +6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.0,12.0,Non-coding +7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.0,9.0,Non-coding +8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.09090909090909091,11.0,Non-coding +9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,11.0,Non-coding 
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.08333333333333333,12.0,Non-coding +11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.09090909090909091,11.0,Non-coding +13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,11.0,Coding +14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,11.0,Non-coding +15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,0.0,Read length was shorter than 3 * preptide k-mer size +18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.07692307692307693,13.0,Non-coding +19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,12.0,Non-coding +20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide +21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,6.0,Low complexity peptide diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-6.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-6.csv new file mode 100644 index 00000000..2c79d464 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-6.csv @@ -0,0 +1,23 @@ +,read_id,jaccard_in_peptide_db,n_kmers,classification +0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,16.0,Coding +1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,18.0,Coding +2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,17.0,Coding +3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,17.0,Coding +4,SRR306838.13334230 
Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,13.0,Coding +5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,19.0,Coding +6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,18.0,Coding +7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,15.0,Coding +8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,17.0,Coding +9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.8823529411764706,17.0,Coding +10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,18.0,Coding +11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,16.0,Coding +13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,17.0,Coding +14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,17.0,Coding +15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,6.0,Coding +18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,19.0,Coding +19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,18.0,Coding +20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide +21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,6.0,Low complexity peptide diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-7.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-7.csv new file mode 100644 index 00000000..30ed369d --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-7.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 
Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,16.0,Coding +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.9411764705882353,17.0,Coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,16.0,Coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,16.0,Coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,12.0,Coding +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,18.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.7647058823529411,17.0,Coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.7857142857142857,14.0,Coding +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,15.0,Coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.25,16.0,Non-coding +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,17.0,Coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,16.0,Coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,16.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.625,16.0,Coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,5.0,Coding +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.9444444444444444,18.0,Coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.9411764705882353,17.0,Coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in dayhoff encoding diff --git 
a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-8.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-8.csv new file mode 100644 index 00000000..ea1e3a37 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-8.csv @@ -0,0 +1,23 @@ +,read_id,jaccard_in_peptide_db,n_kmers,classification +0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,15,coding +1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.8125,16,non-coding +2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.7333333333333333,15,non-coding +3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.7857142857142857,14,non-coding +4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,11,coding +5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,17,coding +6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.5,16,non-coding +7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.5384615384615384,13,non-coding +8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.7142857142857143,14,non-coding +9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.06666666666666667,15,non-coding +10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.8125,16,non-coding +11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,0.0,0,non-coding +12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.4666666666666667,15,non-coding +13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,15,coding +14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.13333333333333333,15,non-coding +15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,0.0,0,non-coding +16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,0.0,0,non-coding +17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,4,coding 
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.5882352941176471,17,non-coding +19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.625,16,non-coding +20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,-1.0,2,low complexity nucleotide +21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,-2.0,6,low complexity peptide diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-16.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-16.csv new file mode 100644 index 00000000..89acc80a --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-16.csv @@ -0,0 +1,23 @@ +,read_id,jaccard_in_peptide_db,n_kmers,classification +0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,6.0,Coding +1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,8.0,Coding +2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,7.0,Coding +3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,7.0,Coding +4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,3.0,Coding +5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,9.0,Coding +6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,8.0,Coding +7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,5.0,Coding +8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,7.0,Coding +9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,1.0,7.0,Coding +10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,8.0,Coding +11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,7.0,Coding +13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,7.0,Coding +14,SRR306838.21417895 
Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,7.0,Coding +15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,0.0,Read length was shorter than 3 * preptide k-mer size +18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,9.0,Coding +19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,8.0,Coding +20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide +21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,1.0,Low complexity peptide diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-20.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-20.csv new file mode 100644 index 00000000..634de4f9 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-20.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,,2.0,Low complexity peptide in hydrophobic-polar encoding +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,4.0,Coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.6666666666666666,3.0,Non-coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.6666666666666666,3.0,Non-coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,5.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.75,4.0,Non-coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,2.0,Coding +SRR306838.10872941 
Ibis_Run100924_C3PO:6:53:6164:10522/1,0.6666666666666666,3.0,Non-coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,3.0,Non-coding +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.75,4.0,Non-coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.6666666666666666,3.0,Non-coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,3.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.6666666666666666,3.0,Non-coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.6,5.0,Non-coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.25,4.0,Non-coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in hydrophobic-polar encoding diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-21.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-21.csv new file mode 100644 index 00000000..288d511c --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-21.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,2.0,Coding +SRR306838.6196593 
Ibis_Run100924_C3PO:6:29:16733:12435/1,0.6666666666666666,3.0,Non-coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.5,2.0,Non-coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.5,2.0,Non-coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,4.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.6666666666666666,3.0,Non-coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.5,2.0,Non-coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,2.0,Non-coding +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.3333333333333333,3.0,Non-coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.5,2.0,Non-coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,2.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.5,2.0,Non-coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.5,4.0,Non-coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,3.0,Non-coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in hydrophobic-polar encoding diff --git 
a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-22.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-22.csv new file mode 100644 index 00000000..b80299c6 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-22.csv @@ -0,0 +1,23 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,2.0,Non-coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,,0.0,Read length was shorter than 3 * preptide k-mer size +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,3.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.5,2.0,Non-coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,,0.0,Read length was shorter than 3 * preptide k-mer size +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.0,2.0,Non-coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,,,All translations shorter than peptide k-mer size + 1 +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,,,All 
translations shorter than peptide k-mer size + 1 +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,0.0,Read length was shorter than 3 * preptide k-mer size +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,0.0,Read length was shorter than 3 * preptide k-mer size +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.3333333333333333,3.0,Non-coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,2.0,Non-coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,1.0,Low complexity peptide diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-7.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-7.csv new file mode 100644 index 00000000..b35d3d49 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-7.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,12.0,Coding +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,17.0,Coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,16.0,Coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,16.0,Coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,12.0,Coding +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,18.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,17.0,Coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,13.0,Coding +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,16.0,Coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,1.0,15.0,Coding +SRR306838.21295280 
Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,15.0,Coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,9.0,Coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,16.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,15.0,Coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,5.0,Coding +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,16.0,Coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,16.0,Coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in hydrophobic-polar encoding diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-8.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-8.csv new file mode 100644 index 00000000..a5e4645c --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-8.csv @@ -0,0 +1,23 @@ +,read_id,jaccard_in_peptide_db,n_kmers,classification +0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,12,coding +1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,16,coding +2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,15,coding +3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,15,coding +4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,11,coding 
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,17,coding +6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,16,coding +7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,13,coding +8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,15,coding +9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,1.0,15,coding +10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,15,coding +11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,0.0,0,non-coding +12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,10,coding +13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,15,coding +14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,15,coding +15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,0.0,0,non-coding +16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,0.0,0,non-coding +17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,4,coding +18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,16,coding +19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,16,coding +20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,-1.0,2,low complexity nucleotide +21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,-2.0,7,low complexity peptide diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-6.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-6.csv new file mode 100644 index 00000000..e7a27a76 --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-6.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,17.0,Coding +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.05555555555555555,18.0,Non-coding +SRR306838.20767303 
Ibis_Run100924_C3PO:6:104:6864:5062/1,0.17647058823529413,17.0,Non-coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.0625,16.0,Non-coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.07692307692307693,13.0,Non-coding +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,19.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.05555555555555555,18.0,Non-coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.125,16.0,Non-coding +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.0625,16.0,Non-coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,17.0,Non-coding +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.16666666666666666,18.0,Non-coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.17647058823529413,17.0,Non-coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,17.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,17.0,Non-coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,0.16666666666666666,6.0,Non-coding +SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.15789473684210525,19.0,Non-coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.05555555555555555,18.0,Non-coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in protein encoding diff --git 
a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-7.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-7.csv new file mode 100644 index 00000000..1f9bdd5e --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-7.csv @@ -0,0 +1,24 @@ +read_id,jaccard_in_peptide_db,n_kmers,classification +SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,16.0,Coding +SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,17.0,Non-coding +SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.0,16.0,Non-coding +SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.0,16.0,Non-coding +SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.0,12.0,Non-coding +SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,18.0,Coding +SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.058823529411764705,17.0,Non-coding +SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.0,14.0,Non-coding +SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.0625,16.0,Non-coding +SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.125,16.0,Non-coding +SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.058823529411764705,17.0,Non-coding +SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons +SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.0625,16.0,Non-coding +SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,16.0,Coding +SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0625,16.0,Non-coding +SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons +SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons +SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,0.0,5.0,Non-coding 
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.1111111111111111,18.0,Non-coding +SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.17647058823529413,17.0,Non-coding +SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size +SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size +adversarial_low_complexity_peptide,,1.0,Low complexity peptide in protein encoding diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-8.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-8.csv new file mode 100644 index 00000000..6279d32a --- /dev/null +++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-8.csv @@ -0,0 +1,23 @@ +,read_id,jaccard_in_peptide_db,n_kmers,classification +0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,15,coding +1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,16,non-coding +2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.0,15,non-coding +3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.07142857142857142,14,non-coding +4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.0,11,non-coding +5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,17,coding +6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.0625,16,non-coding +7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.07142857142857142,14,non-coding +8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.07142857142857142,14,non-coding +9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.06666666666666667,15,non-coding +10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.125,16,non-coding +11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,0.0,0,non-coding +12,SRR306838.18327923 
Ibis_Run100924_C3PO:6:92:9077:13885/1,0.06666666666666667,15,non-coding +13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,15,coding +14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,15,non-coding +15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,0.0,0,non-coding +16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,0.0,0,non-coding +17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,0.25,4,non-coding +18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.0,17,non-coding +19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,16,non-coding +20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,-1.0,2,low complexity nucleotide +21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,-2.0,7,low complexity peptide diff --git a/tests/data/extract_coding/true_protein_coding.fasta b/tests/data/extract_coding/true_protein_coding.fasta new file mode 100644 index 00000000..473153f2 --- /dev/null +++ b/tests/data/extract_coding/true_protein_coding.fasta @@ -0,0 +1,6 @@ +>SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1 translation_frame: -2 jaccard: 1.0 +TEQDLQLYCDFPNIIDVSIKQA +>SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1 translation_frame: -1 jaccard: 1.0 +QSSSPEFRVQSFSERTNARKKNNH +>SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1 translation_frame: 2 jaccard: 1.0 +LDPPYSRVITQRETENNQMTSE diff --git a/tests/data/low_complexity_nucleotides.fastq b/tests/data/low_complexity_nucleotides.fastq new file mode 100644 index 00000000..1413e5c6 --- /dev/null +++ b/tests/data/low_complexity_nucleotides.fastq @@ -0,0 +1,4 @@ +@SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1 +ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ++ +(04147:;:9<<:7;88<>=@>>8<;;<=;C;>;:5:;9<<::6@;E;?:C@=:9:67 diff --git a/tests/data/low_complexity_peptides.fastq b/tests/data/low_complexity_peptides.fastq new file mode 100644 index 00000000..e2e72994 --- /dev/null +++ 
b/tests/data/low_complexity_peptides.fastq @@ -0,0 +1,4 @@ +@SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1 +CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACACCC ++ +2263688B;9<<9;=;9<=><:;=:@<@<<;@;S5:;;M true_n_unique_kmers * 0.95 + assert test.n_unique_kmers() < true_n_unique_kmers * 1.05 + + +def test_maybe_make_peptide_bloom_filter(peptide_bloom_filter_path, + molecule, peptide_ksize): + from khtools.bloom_filter import maybe_make_peptide_bloom_filter + + maybe_make_peptide_bloom_filter(peptide_bloom_filter_path, + peptide_ksize, + molecule, + peptides_are_bloom_filter=True) + # No assertion, just check that it ran + # assert isinstance(test, khmer.Nodegraph) + + +def test_cli_minimum(peptide_fasta): + from khtools.bloom_filter import cli + + runner = CliRunner() + result = runner.invoke(cli, [ + peptide_fasta, + ]) + assert result.exit_code == 0 + + +def test_cli_options(peptide_fasta, molecule, peptide_ksize): + from khtools.bloom_filter import cli + + runner = CliRunner() + result = runner.invoke(cli, [ + '--peptide-ksize', peptide_ksize, '--molecule', molecule, + "--tablesize", "1e4", + peptide_fasta, + ]) + assert result.exit_code == 0 + + +def test_get_peptide_ksize_default(molecule): + from khtools.bloom_filter import get_peptide_ksize, \ + DEFAULT_PROTEIN_KSIZE, DEFAULT_HP_KSIZE, DEFAULT_DAYHOFF_KSIZE + + test = get_peptide_ksize(molecule, peptide_ksize=None) + if molecule == 'protein': + assert test == DEFAULT_PROTEIN_KSIZE + elif molecule == 'dayhoff': + assert test == DEFAULT_DAYHOFF_KSIZE + elif molecule == 'hydrophobic-polar': + assert test == DEFAULT_HP_KSIZE + + +def test_get_peptide_ksize_with_ksize(molecule): + from khtools.bloom_filter import get_peptide_ksize + + peptide_ksize = 123 + test = get_peptide_ksize(molecule, peptide_ksize) + assert test == peptide_ksize + + +def test_get_peptide_ksize_with_bad_molecule(): + from khtools.bloom_filter import get_peptide_ksize + + peptide_ksize = 123 + with 
pytest.raises(ValueError): + get_peptide_ksize("not a real molecule type", peptide_ksize) diff --git a/khtools/tests/test_commandline.py b/tests/test_commandline.py similarity index 99% rename from khtools/tests/test_commandline.py rename to tests/test_commandline.py index 421a3adb..dc64d88e 100755 --- a/khtools/tests/test_commandline.py +++ b/tests/test_commandline.py @@ -1,6 +1,5 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - """ test_commandline ---------------------------------- diff --git a/khtools/tests/test_compare_kmer_content.py b/tests/test_compare_kmer_content.py similarity index 91% rename from khtools/tests/test_compare_kmer_content.py rename to tests/test_compare_kmer_content.py index 0072f394..3ab5d7d8 100644 --- a/khtools/tests/test_compare_kmer_content.py +++ b/tests/test_compare_kmer_content.py @@ -66,9 +66,12 @@ def test_jaccardize(nucleotide_seq1): def test_kmer_comparison_table(nucleotide_seq1, nucleotide_seq2, ksizes): from khtools.compare_kmer_content import kmer_comparison_table - test = kmer_comparison_table('seq1', nucleotide_seq1, - 'seq2', nucleotide_seq2, - 'nucleotide', ksizes=ksizes) + test = kmer_comparison_table('seq1', + nucleotide_seq1, + 'seq2', + nucleotide_seq2, + 'nucleotide', + ksizes=ksizes) s = """id1,id2,ksize,jaccard,molecule seq1,seq2,2,1.0,nucleotide seq1,seq2,3,0.8,nucleotide diff --git a/khtools/tests/test_ensembl.py b/tests/test_ensembl.py similarity index 100% rename from khtools/tests/test_ensembl.py rename to tests/test_ensembl.py diff --git a/tests/test_extract_coding.py b/tests/test_extract_coding.py new file mode 100644 index 00000000..c6dcd87b --- /dev/null +++ b/tests/test_extract_coding.py @@ -0,0 +1,329 @@ +import os +import warnings + +from Bio.Seq import Seq +from click.testing import CliRunner +import pandas as pd +import pandas.util.testing as pdt +import pytest +import screed + + +@pytest.fixture +def seq(): + s = 'CGCTTGCTTAATACTGACATCAATAATATTAGGAAAATCGCAATATAACTGTAAATCCTGTTCTGTC' + with 
warnings.catch_warnings(): + # Ignore The following warning because we don't use Bio.Alphabet + # explicitly: + # PendingDeprecationWarning: We intend to remove or replace + # Bio.Alphabet in 2020, ideally avoid using it explicitly in your + # code. Please get in touch if you will be adversely affected by this. + # https://github.com/biopython/biopython/issues/2046 + warnings.simplefilter("ignore") + return Seq(s) + + +@pytest.fixture +def low_complexity_seq(): + return "CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCA" \ + "CACACCCCCAACACCC" + + +@pytest.fixture(params=['seq', 'low_complexity_seq']) +def type_seq(request, seq, low_complexity_seq): + if request.param == 'seq': + return request.param, seq + elif request.param == 'low_complexity_seq': + return request.param, low_complexity_seq + + +def test_three_frame_translation(seq): + from khtools.extract_coding import three_frame_translation + + test = [str(x) for x in three_frame_translation(seq)] + true = [ + 'RLLNTDINNIRKIAI*L*ILFC', 'ACLILTSIILGKSQYNCKSCSV', + 'LA*Y*HQ*Y*ENRNITVNPVL' + ] + assert test == true + + +def test_compute_fastp_low_complexity(type_seq): + from khtools.extract_coding import compute_fastp_complexity + + seqtype, seq = type_seq + test = compute_fastp_complexity(seq) + if seqtype == 'seq': + assert test == 0.746268656716418 + elif seqtype == 'low_complexity_seq': + assert test == 0.2631578947368421 + + +def test_evaluate_is_fastp_low_complexity(type_seq): + from khtools.extract_coding import evaluate_is_fastp_low_complexity + + seqtype, seq = type_seq + + test = evaluate_is_fastp_low_complexity(seq) + if seqtype == 'seq': + # regular sequence is not low complexity + assert not test + elif seqtype == 'low_complexity_seq': + # low complexity sequence should evaluate to low complexity! 
+ assert test + + +def test_three_frame_translation_no_stops(seq): + from khtools.extract_coding import three_frame_translation_no_stops + + test = { + k: str(v) + for k, v in three_frame_translation_no_stops(seq).items() + } + true = {2: 'ACLILTSIILGKSQYNCKSCSV'} + assert test == true + + +def test_six_frame_translation_no_stops(seq): + from khtools.extract_coding import six_frame_translation_no_stops + + test = {k: str(v) for k, v in six_frame_translation_no_stops(seq).items()} + true = { + 2: 'ACLILTSIILGKSQYNCKSCSV', + -2: 'TEQDLQLYCDFPNIIDVSIKQA', + -3: 'QNRIYSYIAIFLILLMSVLSK' + } + assert test == true + + +@pytest.fixture +def reads(data_folder): + return os.path.join( + data_folder, + 'SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq') + + +@pytest.fixture +def true_scores_path(data_folder, molecule, peptide_ksize): + return os.path.join( + data_folder, "extract_coding", + "SRR306838_GSM752691_hsa_br_F_1_trimmed_" + f"subsampled_n22__molecule-{molecule}_ksize-" + f"{peptide_ksize}.csv") + + +@pytest.fixture +def true_scores(true_scores_path): + return pd.read_csv(true_scores_path) + + +@pytest.fixture +def true_protein_coding_fasta_path(data_folder): + return os.path.join(data_folder, "extract_coding", + "true_protein_coding.fasta") + + +@pytest.fixture +def true_protein_coding_fasta_string(true_protein_coding_fasta_path): + with open(true_protein_coding_fasta_path) as f: + return f.read() + + +def test_score_reads(capsys, tmpdir, reads, peptide_bloom_filter, molecule, + true_scores, true_scores_path, + true_protein_coding_fasta_path): + from khtools.extract_coding import score_reads + + test = score_reads(reads, + peptide_bloom_filter, + molecule=molecule) + + # Check that scoring was the same + pdt.assert_equal(test, true_scores) + + # --- Check fasta output --- # + captured = capsys.readouterr() + test_names = [] + for line in captured.out.splitlines(): + if line.startswith(">"): + test_names.append(line.lstrip('>')) + + # Check that the proper 
sequences were output + true_names = get_fasta_record_names(true_protein_coding_fasta_path) + + # Check that precision is high -- everything in "test" was truly coding + assert all(test_name in true_names for test_name in test_names) + + captured_lines = captured.out.splitlines() + with open(true_protein_coding_fasta_path) as f: + for true_line in f.readlines(): + assert true_line.strip() in captured_lines + + +def write_fasta_string_to_file(fasta_string, folder, prefix): + test_fasta_filename = os.path.join(folder, prefix + '.fasta') + with open(test_fasta_filename, "w") as f: # must open for writing + f.write(fasta_string) + return test_fasta_filename + + +def get_fasta_record_names(fasta_path): + names = [] + for record in screed.open(fasta_path): + name = record['name'] + names.append(name) + return set(names) + + +def test_cli_peptide_fasta(reads, peptide_fasta, molecule, peptide_ksize, + true_protein_coding_fasta_string): + from khtools.extract_coding import cli + + runner = CliRunner() + result = runner.invoke(cli, [ + '--peptide-ksize', peptide_ksize, '--molecule', molecule, + peptide_fasta, reads + ]) + assert result.exit_code == 0 + assert true_protein_coding_fasta_string in result.output + + +def test_cli_bad_jaccard_threshold_float(reads, peptide_fasta): + from khtools.extract_coding import cli + + runner = CliRunner() + result = runner.invoke(cli, [ + "--jaccard-threshold", "3.14", peptide_fasta, reads + ]) + assert result.exit_code == 2 + error_message = 'Error: Invalid value for "--jaccard-threshold": ' \ + '--jaccard-threshold needs to be a number between 0 ' \ + 'and 1, but 3.14 was provided' + assert error_message in result.output + + +def test_cli_bad_jaccard_threshold_string(reads, peptide_fasta): + from khtools.extract_coding import cli + + runner = CliRunner() + result = runner.invoke(cli, [ + "--jaccard-threshold", "beyonce", peptide_fasta, reads + ]) + assert result.exit_code == 2 + error_message = 'Error: Invalid value for "--jaccard-threshold": beyonce' \ + ' is not a
valid floating point value' + assert error_message in result.output + + +def test_cli_peptide_bloom_filter(reads, peptide_bloom_filter_path, molecule, + peptide_ksize, + true_protein_coding_fasta_string): + from khtools.extract_coding import cli + + runner = CliRunner() + result = runner.invoke(cli, [ + '--peptide-ksize', peptide_ksize, "--peptides-are-bloom-filter", + '--molecule', molecule, peptide_bloom_filter_path, reads + ]) + assert result.exit_code == 0 + assert true_protein_coding_fasta_string in result.output + + +def test_cli_csv(tmpdir, reads, peptide_bloom_filter_path, molecule, + peptide_ksize, true_protein_coding_fasta_string, true_scores): + from khtools.extract_coding import cli + + csv = os.path.join(tmpdir, 'coding_scores.csv') + + runner = CliRunner() + result = runner.invoke(cli, [ + '--peptide-ksize', peptide_ksize, "--csv", csv, + "--peptides-are-bloom-filter", '--molecule', molecule, + peptide_bloom_filter_path, reads + ]) + assert result.exit_code == 0 + assert true_protein_coding_fasta_string in result.output + assert os.path.exists(csv) + + # the CLI adds the filename to the scoring dataframe + true = true_scores.copy() + true['filename'] = reads + + test_scores = pd.read_csv(csv) + pdt.assert_equal(test_scores, true) + + +def test_cli_coding_nucleotide_fasta(tmpdir, reads, peptide_fasta): + from khtools.extract_coding import cli + + coding_nucleotide_fasta = os.path.join(tmpdir, 'coding_nucleotides.fasta') + + runner = CliRunner() + result = runner.invoke(cli, [ + "--coding-nucleotide-fasta", coding_nucleotide_fasta, + peptide_fasta, reads + ]) + assert result.exit_code == 0 + assert os.path.exists(coding_nucleotide_fasta) + + +def test_cli_noncoding_fasta(tmpdir, reads, peptide_fasta): + from khtools.extract_coding import cli + + noncoding_nucleotide_fasta = os.path.join(tmpdir, + 'noncoding_nucleotides.fasta') + + runner = CliRunner() + result = runner.invoke(cli, [ + "--noncoding-nucleotide-fasta", noncoding_nucleotide_fasta, + 
peptide_fasta, reads + ]) + assert result.exit_code == 0 + assert os.path.exists(noncoding_nucleotide_fasta) + + +def test_cli_low_complexity_nucleotide(tmpdir, reads, peptide_fasta): + from khtools.extract_coding import cli + + low_complexity_nucleotide_fasta = os.path.join( + tmpdir, 'low_complexity_nucleotide.fasta') + + runner = CliRunner() + result = runner.invoke(cli, [ + "--low-complexity-nucleotide-fasta", low_complexity_nucleotide_fasta, + peptide_fasta, reads + ]) + assert result.exit_code == 0 + assert os.path.exists(low_complexity_nucleotide_fasta) + + +def test_cli_low_complexity_peptide( + tmpdir, + reads, + peptide_fasta): + from khtools.extract_coding import cli + + low_complexity_peptide_fasta = os.path.join(tmpdir, + 'low_complexity_peptide.fasta') + + runner = CliRunner() + result = runner.invoke(cli, [ + "--low-complexity-peptide-fasta", low_complexity_peptide_fasta, + peptide_fasta, reads + ]) + assert result.exit_code == 0 + assert os.path.exists(low_complexity_peptide_fasta) + + +def test_cli_json_summary(tmpdir, reads, peptide_fasta): + from khtools.extract_coding import cli + + json_summary = os.path.join(tmpdir, 'coding_summary.json') + + runner = CliRunner() + result = runner.invoke(cli, [ + "--json-summary", json_summary, + peptide_fasta, reads + ]) + assert result.exit_code == 0 + assert os.path.exists(json_summary) diff --git a/khtools/tests/test_os_utils.py b/tests/test_os_utils.py similarity index 100% rename from khtools/tests/test_os_utils.py rename to tests/test_os_utils.py diff --git a/khtools/tests/test_sequence_encodings.py b/tests/test_sequence_encodings.py similarity index 87% rename from khtools/tests/test_sequence_encodings.py rename to tests/test_sequence_encodings.py index 7a3c2059..6a6b8dc8 100644 --- a/khtools/tests/test_sequence_encodings.py +++ b/tests/test_sequence_encodings.py @@ -61,6 +61,7 @@ def test_purine_pyrimidize(nucleotide_string): true = 'RRYYRYR' assert test == true + # -------------------- Test peptide 
encodings ---------------------------- # @@ -94,3 +95,16 @@ def test_botvinnikize(peptide_string): test = botvinnikize(peptide_string) true = 'dadkacbfghf' assert test == true + + +def test_encode_peptide(peptide_string, molecule): + from khtools.sequence_encodings import encode_peptide + + test = encode_peptide(peptide_string, molecule) + if molecule == 'dayhoff': + true = 'bbbdbfecdac' + elif molecule == 'hydrophobic-polar': + true = 'phpphhhpppp' + elif molecule == 'protein': + true = peptide_string + assert test == true