diff --git a/.gitignore b/.gitignore
index 9660ab63..4ab68438 100644
--- a/.gitignore
+++ b/.gitignore
@@ -276,3 +276,4 @@ dmypy.json
# Pyre type checker
.pyre/
+*.nodegraph
diff --git a/README.md b/README.md
index ddc95c67..ce4e794f 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,8 @@
Kmer-hashing tools
================================
-[![image](https://img.shields.io/travis/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D.svg)](https://travis-ci.org/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D)
-
-
-[![codecov](https://codecov.io/gh/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D/branch/master/graph/badge.svg)](https://codecov.io/gh/%7B%7B%20cookiecutter.github_organization%20%7D%7D/%7B%7B%20cookiecutter.repo_name%20%7D%7D)
-
-[![image](https://img.shields.io/pypi/v/%7B%7B%20cookiecutter.repo_name%20%7D%7D.svg)](https://pypi.python.org/pypi/%7B%7B%20cookiecutter.repo_name%20%7D%7D)
-
+[![image](https://img.shields.io/travis/czbiohub/kh-tools.svg)](https://travis-ci.com/czbiohub/kh-tools)
+[![codecov](https://codecov.io/gh/czbiohub/kh-tools/branch/master/graph/badge.svg)](https://codecov.io/gh/czbiohub/kh-tools)
What is khtools?
-------------------------------------
@@ -23,25 +18,91 @@ Installation
To install this code, clone this github repository and use pip to install
```
-git clone czbiohub/khtools.git
-cd khtools
+git clone https://github.com/czbiohub/kh-tools.git
+cd kh-tools
# The "." means "install *this*, the folder where I am now"
-pip install .
+pip install .
```
Usage
-----
-Greet a name multiple times!
+### Extract likely protein-coding reads from sequencing data
+
+
+```
+khtools extract-coding peptides.fa.gz *.fastq.gz > coding_peptides.fasta
+```
+
+#### Save the "coding scores" to a csv
+
+The "coding score" of each read is calculated by translating each read in six
+frames, then calculating the
+[Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) between any of the
+six translated frames of the read and the peptide database. The final coding
+score is the maximum Jaccard index across all reading frames. If you'd like to
+see the coding scores for all reads, use the `--csv` flag.
```
-$ Kmer-hashing tools hello --name "Rosalind Franklin" --count 10
+khtools extract-coding --csv coding_scores.csv peptides.fa.gz *.fastq.gz > coding_peptides.fasta
```
-Features
---------
+#### Save the coding nucleotides to a fasta
+
+By default, only the coding *peptides* are output. If you'd like to also output
+the underlying *nucleotide* sequence, then use the flag `--coding-nucleotide-fasta`
+
+```
+khtools extract-coding --coding-nucleotide-fasta coding_nucleotides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta
+```
-- TODO
+#### Save the *non*-coding nucleotides to a fasta
+
+To see the sequence of reads which were deemed non-coding, use the flag
+`--noncoding-nucleotide-fasta`.
+
+```
+khtools extract-coding --noncoding-nucleotide-fasta noncoding_nucleotides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta
+```
+
+#### Save the low complexity nucleotides to a fasta
+
+To see the sequence of reads found to have too low complexity of nucleotide
+sequence to evaluate, use the flag `--low-complexity-nucleotide-fasta`. Low
+complexity is determined by the same method as the read trimmer
+[fastp](https://github.com/OpenGene/fastp), in which complexity is the fraction
+of bases that differ from the base that follows them, or mathematically, how
+often `seq[i] != seq[i+1]`. Reads whose complexity falls below the threshold
+are considered low complexity. The default threshold is
+`0.3`. As an example, the sequence `CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACACCC`
+would be considered low complexity. While this sequence has many nucleotide
+k-mers, it is likely a result of a sequencing error and we ignore it.
+
+```
+khtools extract-coding --low-complexity-nucleotide-fasta low_complexity_nucleotides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta
+```
+
+#### Save the low complexity peptides to a fasta
+
+Even if the nucleotide sequence may pass the complexity filter, the peptide
+sequence may still be low complexity. As an example, all translated frames of
+the sequence
+`CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG`
+would be considered low complexity, as it translates to either
+`QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ` (5'3' Frame 1),
+`SSSSSSSSSSSSSSSSSSSSSSSSSSSSS` (5'3' Frame 2),
+`AAAAAAAAAAAAAAAAAAAAAAAAAAAAA` (5'3' Frame 3 and 3'5' Frame 3),
+`LLLLLLLLLLLLLLLLLLLLLLLLLLLLLL` (3'5' Frame 1),
+or `CCCCCCCCCCCCCCCCCCCCCCCCCCCCC` (3'5' Frame 2). As these sequences have few
+k-mers and are difficult to assess for how "coding" they are, we ignore them.
+Unlike for nucleotides where we look at runs of consecutive bases, we require
+the translated peptide to contain greater than `(L - k + 1)/2` k-mers, where
+`L` is the length of the sequence and `k` is the k-mer size. To save the
+sequence of low-complexity peptides to a fasta, use the flag
+`--low-complexity-peptide-fasta`.
+
+```
+khtools extract-coding --low-complexity-peptide-fasta low_complexity_peptides.fasta peptides.fa.gz *.fastq.gz > coding_peptides.fasta
+```
diff --git a/docs/usage.rst b/docs/usage.rst
index 5b26480a..e1e20d40 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -5,3 +5,15 @@ Usage
To use Kmer-hashing tools in a project::
import khtools
+
+To create a bloom filter of sequences::
+
+ khtools bloom-filter --molecule protein --peptide-ksize 7 --save-as Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph Homo_sapiens.GRCh38.pep.subset.fa.gz
+
+To partition reads into coding/noncoding bins using the bloom filter::
+
+ khtools extract-coding --peptides-are-bloom-filter Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled.fq.gz
+
+To create the bloom filter and partition the reads in one step::
+
+ khtools extract-coding ~/Downloads/Homo_sapiens.GRCh38.pep.all.fa.gz ~/code/kmer-hashing/extract_kmers/test-data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled.fq.gz
diff --git a/khtools/bloom_filter.py b/khtools/bloom_filter.py
new file mode 100644
index 00000000..7195f027
--- /dev/null
+++ b/khtools/bloom_filter.py
@@ -0,0 +1,253 @@
+import math
+import os
+
+import click
+import khmer
+import screed
+from sourmash._minhash import hash_murmur
+from tqdm import tqdm
+
+from khtools.compare_kmer_content import kmerize
+from khtools.sequence_encodings import encode_peptide, VALID_PEPTIDE_MOLECULES
+
# Sizing parameters handed to khmer.Nodegraph when a bloom filter is built
DEFAULT_N_TABLES = 4
DEFAULT_MAX_TABLESIZE = 100_000_000

# Fallback peptide k-mer size for each supported alphabet
DEFAULT_PROTEIN_KSIZE = 7
DEFAULT_DAYHOFF_KSIZE = 11
DEFAULT_HP_KSIZE = 21
+
+
def per_read_false_positive_coding_rate(n_kmers_in_read, n_total_kmers=1e7,
                                        n_hash_functions=DEFAULT_N_TABLES,
                                        tablesize=DEFAULT_MAX_TABLESIZE):
    """Estimate the chance that every k-mer in a read is a false positive

    Parameters
    ----------
    n_kmers_in_read : int
        Number of k-mers in the read
    n_total_kmers : float
        Number of k-mers inserted into the bloom filter
    n_hash_functions : int
        Number of hash functions (tables) of the bloom filter
    tablesize : int
        Size of each bloom filter table

    Returns
    -------
    per_read_fpr : float
        Per-k-mer false positive rate raised to the power
        ``n_kmers_in_read``

    Notes
    -----
    The exponent and the per-k-mer rate are printed to stdout.
    """
    exponent = - n_hash_functions * n_total_kmers / tablesize
    print(f"exponent: {exponent}")

    # 1 - exp(x) is computed as -expm1(x), since expm1(x) == exp(x) - 1
    one_minus_exp = - math.expm1(exponent)

    # Probability that a single k-mer is randomly reported as present
    per_kmer_fpr = math.pow(one_minus_exp, n_hash_functions)
    print(f"per kmer false positive rate: {per_kmer_fpr}")

    # Probability that all k-mers of the read are false positives
    return math.pow(per_kmer_fpr, n_kmers_in_read)
+
+
def load_nodegraph(*args, **kwargs):
    """Load a khmer Nodegraph, whichever khmer API is available

    All arguments are forwarded unchanged. ``khmer.load_nodegraph`` is tried
    first (khmer 2.1.1); if that raises AttributeError, fall back to
    ``khmer.Nodegraph.load`` (khmer 3+/master branch).
    """
    try:
        nodegraph = khmer.load_nodegraph(*args, **kwargs)
    except AttributeError:
        nodegraph = khmer.Nodegraph.load(*args, **kwargs)
    return nodegraph
+
+
# Adapted from https://click.palletsprojects.com/en/7.x/parameters/
class BasedIntParamType(click.ParamType):
    """Click parameter type for base-10 integers that also accepts
    scientific notation such as "1e8" or "1.5E3"
    """
    name = "integer"

    def convert(self, value, param, ctx):
        """Convert a command line value to an int

        Parameters
        ----------
        value : int or str
            Already-converted integer (returned as is), or a string holding
            either a plain base-10 integer or a number in scientific notation
        param, ctx
            Passed through to ``self.fail`` for error reporting

        Returns
        -------
        converted : int
            Scientific notation values are truncated towards zero when they
            are not whole numbers, e.g. "1.5e0" becomes 1
        """
        # Imported here so the block needs no change to the module imports
        from decimal import Decimal, InvalidOperation

        if isinstance(value, int):
            return value
        if not isinstance(value, str):
            self.fail(
                "expected string for int() conversion, got "
                f"{value!r} of type {type(value).__name__}",
                param,
                ctx,
            )
        try:
            if 'e' in value.lower():
                # Decimal parses the literal exactly; float arithmetic such
                # as 1.15 * 10 ** 2 yields 114.99999999999999 and would be
                # truncated to 114
                return int(Decimal(value))
            return int(value, 10)
        except (ValueError, InvalidOperation):
            self.fail(f"{value!r} is not a valid integer", param, ctx)


BASED_INT = BasedIntParamType()
+
+
def make_peptide_bloom_filter(peptide_fasta,
                              peptide_ksize,
                              molecule,
                              n_tables=DEFAULT_N_TABLES,
                              tablesize=DEFAULT_MAX_TABLESIZE):
    """Build a khmer Nodegraph holding the hashed k-mers of a peptide file

    Parameters
    ----------
    peptide_fasta : str
        Sequence file of peptides, opened with screed
    peptide_ksize : int
        Length of the peptide k-mers
    molecule : str
        Encoding name handed to ``encode_peptide``
    n_tables : int
        Number of tables of the Nodegraph
    tablesize : int
        Size of each table of the Nodegraph

    Returns
    -------
    peptide_bloom_filter : khmer.Nodegraph
        Filter containing the murmur hash of every k-mer of every record
        that has no '*' in its sequence
    """
    bloom = khmer.Nodegraph(peptide_ksize, tablesize, n_tables=n_tables)

    with screed.open(peptide_fasta) as records:
        for record in tqdm(records):
            # Records containing '*' are left out entirely
            if '*' in record['sequence']:
                continue
            encoded = encode_peptide(record['sequence'], molecule)
            try:
                for kmer in kmerize(encoded, peptide_ksize):
                    # The k-mer is hashed to an integer first, because .add
                    # accepts the hashed integer directly
                    bloom.add(hash_murmur(kmer))
            except ValueError:
                # Sequence length is smaller than k-mer size
                continue
    return bloom
+
+
def make_peptide_set(peptide_fasta, peptide_ksize, molecule):
    """Collect the encoded peptide k-mers of a sequence file in a python set

    Exists so the storage and performance of a plain set can be compared to
    the bloom filter. Records whose sequence contains '*' are skipped.
    """
    collected = set()

    with screed.open(peptide_fasta) as records:
        for record in tqdm(records):
            if '*' in record['sequence']:
                continue
            encoded = encode_peptide(record['sequence'], molecule)
            try:
                collected.update(kmerize(encoded, peptide_ksize))
            except ValueError:
                # Sequence length is smaller than k-mer size
                continue
    return collected
+
+
def maybe_make_peptide_bloom_filter(peptides, peptide_ksize, molecule,
                                    peptides_are_bloom_filter,
                                    n_tables=DEFAULT_N_TABLES,
                                    tablesize=DEFAULT_MAX_TABLESIZE):
    """Load an existing peptide bloom filter, or build one from peptides

    Parameters
    ----------
    peptides : str
        Filename of either a saved bloom filter or a peptide sequence file
    peptide_ksize : int or None
        Peptide k-mer size. When loading a filter, None skips the ksize
        check; when building one, None selects the default of ``molecule``
    molecule : str
        Name of the peptide encoding
    peptides_are_bloom_filter : bool
        If True, ``peptides`` is loaded as a bloom filter rather than read
        as sequences
    n_tables : int
        Number of tables, only used when building a new filter
    tablesize : int
        Table size, only used when building a new filter

    Returns
    -------
    peptide_bloom_filter : khmer.Nodegraph

    Raises
    ------
    ValueError
        If a loaded filter's ksize differs from the given ``peptide_ksize``
    """
    if peptides_are_bloom_filter:
        click.echo(
            f"Loading existing bloom filter from {peptides} and "
            f"making sure the ksizes match",
            err=True)
        peptide_bloom_filter = load_nodegraph(peptides)
        # Explicit comparison instead of assert, which `python -O` removes
        if peptide_ksize is not None \
                and peptide_ksize != peptide_bloom_filter.ksize():
            raise ValueError(f"Given peptide ksize ({peptide_ksize}) and "
                             f"ksize found in bloom filter "
                             f"({peptide_bloom_filter.ksize()}) are not "
                             f"equal")
    else:
        peptide_ksize = get_peptide_ksize(molecule, peptide_ksize)
        click.echo(
            f"Creating peptide bloom filter with file: {peptides}\n"
            f"Using ksize: {peptide_ksize} and molecule: {molecule} "
            f"...",
            err=True)
        peptide_bloom_filter = make_peptide_bloom_filter(
            peptides, peptide_ksize, molecule=molecule,
            n_tables=n_tables, tablesize=tablesize)
    return peptide_bloom_filter
+
+
def maybe_save_peptide_bloom_filter(peptides, peptide_bloom_filter, molecule,
                                    save_peptide_bloom_filter):
    """Write the peptide bloom filter to disk if requested

    Parameters
    ----------
    peptides : str
        Filename the filter was built from; used to derive the default
        output filename
    peptide_bloom_filter : khmer.Nodegraph
        Filter to save
    molecule : str
        Name of the peptide encoding, included in the default filename
    save_peptide_bloom_filter : bool or str
        Falsy: do nothing. A string: save to exactly that filename. Any
        other truthy value: save next to ``peptides``, replacing its last
        extension with a suffix naming the molecule and ksize
    """
    if not save_peptide_bloom_filter:
        return

    if isinstance(save_peptide_bloom_filter, str):
        filename = save_peptide_bloom_filter
    else:
        ksize = peptide_bloom_filter.ksize()
        suffix = f'.molecule-{molecule}_ksize-{ksize}.bloomfilter.' \
                 f'nodegraph'
        filename = os.path.splitext(peptides)[0] + suffix

    # Saved exactly once, whichever way the filename was chosen
    click.echo(f"Writing peptide bloom filter to {filename}", err=True)
    peptide_bloom_filter.save(filename)
    click.echo("\tDone!", err=True)
+
+
@click.command()
@click.argument('peptides')
@click.option('--peptide-ksize',
              default=None, type=int,
              help="K-mer size of the peptide sequence to use. Defaults for"
                   " different molecules are, "
                   f"protein: {DEFAULT_PROTEIN_KSIZE}"
                   f", dayhoff: {DEFAULT_DAYHOFF_KSIZE},"
                   f" hydrophobic-polar: {DEFAULT_HP_KSIZE}")
@click.option('--molecule',
              default='protein',
              help="The type of amino acid encoding to use. Default is "
                   "'protein', but 'dayhoff' or 'hydrophobic-polar' can be "
                   "used")
@click.option('--save-as',
              default=None,
              help='If provided, save peptide bloom filter as this filename. '
                   'Otherwise, add ksize and molecule name to input filename.')
@click.option('--tablesize', type=BASED_INT,
              default="1e8",
              help='Size of the bloom filter table to use')
@click.option('--n-tables', type=int,
              default=DEFAULT_N_TABLES,
              help='Number of bloom filter tables to use')
def cli(peptides, peptide_ksize=None, molecule='protein', save_as=None,
        tablesize=DEFAULT_MAX_TABLESIZE, n_tables=DEFAULT_N_TABLES):
    """Make a peptide bloom filter for your peptides

    \b
    Parameters
    ----------
    peptides : str
        Sequence file of peptides
    peptide_ksize : int
        Number of characters in amino acid words
    molecule : str
        The type of amino acid encoding to use
    save_as : str
        Filename to save the bloom filter as
    tablesize : int
        Size of the bloom filter table
    n_tables : int
        Number of bloom filter tables

    """
    # \b above prevents rewrapping of paragraph
    peptide_ksize = get_peptide_ksize(molecule, peptide_ksize)
    peptide_bloom_filter = make_peptide_bloom_filter(peptides, peptide_ksize,
                                                     molecule,
                                                     n_tables=n_tables,
                                                     tablesize=tablesize)
    click.echo("\tDone!", err=True)

    # Always save: to --save-as when given, else to the derived filename
    save_peptide_bloom_filter = save_as if save_as is not None else True
    maybe_save_peptide_bloom_filter(
        peptides,
        peptide_bloom_filter,
        molecule,
        save_peptide_bloom_filter=save_peptide_bloom_filter)
+
+
def get_peptide_ksize(molecule, peptide_ksize):
    """Return the peptide k-mer size, falling back to the molecule's default

    Parameters
    ----------
    molecule : str
        Name of the peptide encoding; must be in VALID_PEPTIDE_MOLECULES
    peptide_ksize : int or None
        Explicit k-mer size. Returned unchanged when it is not None

    Returns
    -------
    peptide_ksize : int or None
        The given size, or the default for 'protein', 'dayhoff' or
        'hydrophobic-polar'/'hp'. None if no size was given and the molecule
        is valid but none of those names

    Raises
    ------
    ValueError
        If ``molecule`` is not in VALID_PEPTIDE_MOLECULES
    """
    if molecule not in VALID_PEPTIDE_MOLECULES:
        raise ValueError(f"{molecule} is not a valid protein encoding! "
                         f"Only one of 'protein', 'hydrophobic-polar', or"
                         f" 'dayhoff' can be specified")

    if peptide_ksize is not None:
        return peptide_ksize
    if molecule == 'protein':
        return DEFAULT_PROTEIN_KSIZE
    if molecule == 'dayhoff':
        return DEFAULT_DAYHOFF_KSIZE
    if molecule in ('hydrophobic-polar', 'hp'):
        return DEFAULT_HP_KSIZE
    return None
diff --git a/khtools/commandline.py b/khtools/commandline.py
index f5054a5e..c1802081 100644
--- a/khtools/commandline.py
+++ b/khtools/commandline.py
@@ -10,25 +10,29 @@
import click
# Within-module imports
-from khtools.hello import hello
-
+from khtools.extract_coding import cli as extract_coding
+from khtools.bloom_filter import cli as bloom_filter
click.option = partial(click.option, show_default=True)
settings = dict(help_option_names=['-h', '--help'])
-@click.group(options_metavar='', subcommand_metavar='',
+
+@click.group(options_metavar='',
+ subcommand_metavar='',
context_settings=settings)
def cli():
"""
Kmer hashing tools contains data cleaning and visualization code for
+ analyzing sequencing datasets at the k-mer level
+ Kmer hashing tools contains data cleaning and visualization code for
analyzing kmer-hashing similarity matrices
"""
pass
-cli.add_command(hello, name='hello')
-
+cli.add_command(extract_coding, name='extract-coding')
+cli.add_command(bloom_filter, name='bloom-filter')
if __name__ == "__main__":
cli()
diff --git a/khtools/extract_coding.py b/khtools/extract_coding.py
new file mode 100644
index 00000000..53a6313d
--- /dev/null
+++ b/khtools/extract_coding.py
@@ -0,0 +1,680 @@
+"""
+extract_coding.py
+
+Partition reads into coding, noncoding, and low-complexity bins
+"""
+import json
+import sys
+import warnings
+
+from Bio.Seq import Seq
+import click
+import numpy as np
+import pandas as pd
+import screed
+from sourmash._minhash import hash_murmur
+from khtools.sequence_encodings import encode_peptide
+from khtools.compare_kmer_content import kmerize
+from khtools.bloom_filter import (maybe_make_peptide_bloom_filter,
+ maybe_save_peptide_bloom_filter,
+ DEFAULT_PROTEIN_KSIZE,
+ DEFAULT_DAYHOFF_KSIZE, DEFAULT_HP_KSIZE,
+ DEFAULT_N_TABLES, DEFAULT_MAX_TABLESIZE,
+ BASED_INT)
+from tqdm import tqdm
+
# Jaccard thresholds used when none is given: one for the hydrophobic-polar
# encoding, one for every other encoding
DEFAULT_JACCARD_THRESHOLD = 0.5
DEFAULT_HP_JACCARD_THRESHOLD = 0.8

# Human-readable description of each optional output file, keyed by the
# sequence type written to it
SEQTYPE_TO_ANNOUNCEMENT = dict(
    noncoding_nucleotide=(
        "nucleotide sequence from reads WITHOUT matches to protein-coding "
        "peptides"),
    coding_nucleotide=(
        "nucleotide sequence from reads WITH protein-coding translation "
        "frame nucleotides"),
    low_complexity_nucleotide=(
        "nucleotide sequence from low complexity (low entropy) reads"),
    low_complexity_peptide=(
        "peptide sequence from low complexity (low entropy) translated "
        "reads"),
)

# Column order of the per-read coding score table
SCORING_DF_COLUMNS = [
    'read_id',
    'jaccard_in_peptide_db',
    'n_kmers',
    'classification',
]
+
+
def validate_jaccard(ctx, param, value):
    """Ensure Jaccard threshold is between 0 and 1

    Click option callback.

    Parameters
    ----------
    ctx, param
        Supplied by click; unused
    value : str, float or None
        Threshold as given on the command line

    Returns
    -------
    jaccard : float or None
        ``value`` converted to float, or None when no value was given

    Raises
    ------
    click.BadParameter
        If ``value`` is not a number or is outside of [0, 1]
    """
    if value is None:
        return value
    try:
        jaccard = float(value)
    except ValueError:
        jaccard = None
    # Explicit range check instead of assert, which `python -O` removes.
    # NaN fails the chained comparison and is rejected as well
    if jaccard is None or not 0 <= jaccard <= 1:
        raise click.BadParameter(f'--jaccard-threshold needs to be a number'
                                 f' between 0 and 1, but {value} was provided')
    return jaccard
+
+
def write_fasta(file_handle, description, sequence):
    """Write one fasta record (">description" line, then the sequence)"""
    record = ">{}\n{}\n".format(description, sequence)
    file_handle.write(record)
+
+
def open_and_announce(filename, seqtype, quiet=False):
    """Open ``filename`` for writing, announcing on stderr what goes into it

    The announcement text is looked up in SEQTYPE_TO_ANNOUNCEMENT by
    ``seqtype``; with ``quiet`` set, nothing is looked up or printed.
    """
    if quiet:
        return open(filename, 'w')
    click.echo(f"Writing {SEQTYPE_TO_ANNOUNCEMENT[seqtype]} to {filename}",
               err=True)
    return open(filename, 'w')
+
+
def three_frame_translation(seq, debug=False):
    """Yield the translation of ``seq`` starting at offsets 0, 1 and 2

    Warnings raised while translating are suppressed unless ``debug`` is
    set, in which case the 'default' warning filter is applied.
    """
    action = 'default' if debug else 'ignore'

    with warnings.catch_warnings():
        warnings.simplefilter(action)
        for offset in (0, 1, 2):
            yield seq[offset:].translate()
+
+
def three_frame_translation_no_stops(seq, debug=False, sign=1):
    """Map reading frame number to translation, dropping frames with stops

    Frames are numbered 1 to 3 and multiplied by ``sign``; a translation
    containing '*' is left out.
    """
    kept = {}
    frames = enumerate(three_frame_translation(seq, debug), start=1)
    for frame_number, translated in frames:
        if '*' in translated:
            continue
        kept[sign * frame_number] = translated
    return kept
+
+
def six_frame_translation_no_stops(seq, debug=False):
    """Translate both strands of ``seq``, keeping only stop-free frames

    Forward frames are keyed 1 to 3. Frames of the reverse complement are
    keyed -1 to -3 so it is obvious they come from the reverse strand.
    """
    forward = three_frame_translation_no_stops(seq, debug)
    reverse = three_frame_translation_no_stops(
        seq.reverse_complement(), debug, sign=-1)
    return {**forward, **reverse}
+
+
def score_single_translation(translation,
                             peptide_bloom_filter,
                             peptide_ksize,
                             molecule='protein',
                             verbose=True):
    """Compute the fraction of a translation's k-mers found in the database

    Parameters
    ----------
    translation : str
        Peptide sequence; it is passed through ``encode_peptide`` here
    peptide_bloom_filter : khmer.Nodegraph
        Database of hashed peptide k-mers
    peptide_ksize : int
        Length of the peptide k-mers
    molecule : str
        Encoding name handed to ``encode_peptide``
    verbose : bool or int
        Values greater than 1 print the k-mers and their database lookups
        to stderr

    Returns
    -------
    fraction_in_peptide_db : float
        Number of distinct k-mers found in the filter divided by the number
        of distinct k-mers
    n_kmers : int
        Number of distinct k-mers in the encoded translation
    """
    encoded = encode_peptide(translation, molecule)
    kmers = list(set(kmerize(str(encoded), peptide_ksize)))
    hashes = [hash_murmur(kmer) for kmer in kmers]
    n_kmers = len(kmers)
    n_kmers_in_peptide_db = sum(1 for h in hashes
                                if peptide_bloom_filter.get(h) > 0)
    if verbose > 1:
        click.echo(f"\ttranslation: \t{encoded}", err=True)
        # click.echo takes a single message; a second positional argument
        # would be treated as the output file
        click.echo("\tkmers: " + ' '.join(kmers), err=True)

        kmers_in_peptide_db = {(k, h): peptide_bloom_filter.get(h)
                               for k, h in zip(kmers, hashes)}
        click.echo("\tK-mers in peptide database:", err=True)
        click.echo(kmers_in_peptide_db, err=True)

    fraction_in_peptide_db = n_kmers_in_peptide_db / n_kmers

    return fraction_in_peptide_db, n_kmers
+
+
def evaluate_is_fastp_low_complexity(seq, complexity_threshold=0.3):
    """Decide whether a sequence is low complexity, in the style of fastp

    Sequences made mostly of runs of the same base, e.g.
    CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACAC
    score a low complexity. The threshold of 0.3 is the one used in the
    fastp project: https://github.com/OpenGene/fastp

    Parameters
    ----------
    seq : str
        Sequence to compute complexity on
    complexity_threshold : float, default 0.3
        Value between 0 and 1. The default is 0.3 because that is the default
        in the command line program fastp

    Returns
    -------
    is_low_complexity : bool
        True if the complexity of the sequence is below the threshold
    """
    return compute_fastp_complexity(seq) < complexity_threshold
+
+
def compute_fastp_complexity(seq):
    """Compute the fraction of positions whose next base is different

    Parameters
    ----------
    seq : str
        Sequence to compute complexity on

    Returns
    -------
    complexity : float
        Number of positions ``i`` with ``seq[i] != seq[i + 1]``, divided by
        the length of the sequence. An empty sequence has complexity 0.0
    """
    length = len(seq)
    if length == 0:
        # Nothing to compare, and dividing by the length would fail
        return 0.0
    n_different_consecutively = sum(1 for i in range(length - 1)
                                    if seq[i] != seq[i + 1])
    return n_different_consecutively / length
+
+
def evaluate_is_kmer_low_complexity(sequence, ksize):
    """Check if sequence is low complexity, i.e. mostly repetitive

    The sequence counts as low complexity when the number of k-mers returned
    by ``kmerize`` is at most half of ``len(sequence) - ksize + 1``, the
    number of k-mer positions on the sequence.

    Returns
    -------
    is_low_complexity : bool or None
    n_kmers : int or None
        Both are None when ``kmerize`` raises ValueError
    """
    with warnings.catch_warnings():
        # Ignore Biopython warning of seq objects being strings now
        warnings.filterwarnings('ignore')
        try:
            kmers = kmerize(sequence, ksize)
        except ValueError:
            # k-mer size is larger than sequence
            return None, None

    n_kmers = len(kmers)
    half_of_possible_kmers = (len(sequence) - ksize + 1) / 2
    return n_kmers <= half_of_possible_kmers, n_kmers
+
+
def score_single_read(sequence,
                      peptide_bloom_filter,
                      peptide_ksize,
                      molecule='protein',
                      verbose=True,
                      jaccard_threshold=0.9,
                      description=None,
                      noncoding_file_handle=None,
                      coding_nucleotide_file_handle=None,
                      low_complexity_peptide_file_handle=None):
    """Predict whether a nucleotide sequence could be protein-coding

    Every translation frame scoring above the threshold is written to stdout
    as a fasta record.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence to predict on
    peptide_bloom_filter : khmer.Nodegraph
        Database of known peptide k-mers from a well-studied organism, e.g.
        human protein-coding sequences. Must have been built on peptides using
        the same k-mer size and molecular encoding as specified here, otherwise
        the results will make no sense
    peptide_ksize : int
        Length of the peptide words in sequence. Must match the k-mer size used
        for the peptide_bloom_filter otherwise nothing will match, or only
        false positives will match.
    molecule : str
        One of "protein"|"peptide", "dayhoff", or "hydrophobic-polar"|"hp" to
        encode the protein-coding space. Where "protein"|"peptide" is the
        original 20-letter amino acid encoding, Dayhoff ("dayhoff") is a lossy
        6-letter encoding that categorizes the amino acids into:
            1. Cysteine,
            2. Small (A, G, P, S, T)
            3. Acid and Amide (D, E, N, Q)
            4. Basic (H, K, R)
            5. Hydrophobic (I, L, M, V)
            6. Aromatic (F, W, Y)
        Hydrophobic-polar maps to a mere two categories:
            1. Hydrophobic (A, F, G, I, L, M, P, V, W, Y)
            2. Polar (C, D, E, H, K, N, Q, R, S, T)
    verbose : bool
        Whether or not to print a lot of stuff
    jaccard_threshold : float
        Value between 0 and 1. The signature default is 0.9. Only when None
        is passed explicitly is the per-molecule default used instead: 0.8
        for "hydrophobic-polar"|"hp", since it is so lossy it's more likely
        to match random sequence, and 0.5 for everything else. These
        thresholds were determined empirically with a pre-chosen human
        RNA-seq dataset and human peptides.
    description : str
        The identifier in the sequence file, i.e. the name or descriptor of the
        sequence. It is concatenated into fasta headers, so it should be a
        string whenever a record may be written
    noncoding_file_handle : None or file
        If not None, write noncoding nucleotide reads to this file handle
    coding_nucleotide_file_handle : None or file
        If not None, write coding nucleotides reads to this file handle
    low_complexity_peptide_file_handle : None or file
        If not None, write low complexity peptide sequences to this file handle

    Returns
    -------
    max_fraction_in_peptide_db : float
        Of all reading frames, the maximum fraction of k-mers that matches the
        peptide database. NaN for the special cases below
    max_n_kmers: int
        Number of k-mers of the best-scoring translated, encoded peptide.
        NaN when no frame could be scored
    special_case : str or None
        Additional message to write in the output csv describing the reason
        why this sequence is or isn't protein-coding
    """
    # Convert to BioPython sequence object for translation
    seq = Seq(sequence)

    # In case this is used from the Python API and the default threshold isn't
    # specified
    jaccard_threshold = get_jaccard_threshold(jaccard_threshold, molecule)

    # Translate all six frames, keeping only those without stop codons
    translations = six_frame_translation_no_stops(seq)
    # For all translations, use the one with the maximum number of k-mers
    # in the database
    max_n_kmers = 0
    max_fraction_in_peptide_db = 0
    if len(translations) == 0:
        return np.nan, np.nan, "No translation frames without stop codons"

    # Keep only translations strictly longer than the peptide k-mer size
    translations = {
        frame: translation
        for frame, translation in translations.items()
        if len(translation) > peptide_ksize
    }
    if len(translations) == 0:
        return np.nan, np.nan, "All translations shorter than peptide k-mer " \
                               "size + 1"

    for frame, translation in translations.items():
        # Convert back to string
        translation = str(translation)

        # Maybe reencode to dayhoff/hp space
        encoded = encode_peptide(translation, molecule)

        is_kmer_low_complexity, n_kmers = evaluate_is_kmer_low_complexity(
            encoded, peptide_ksize)

        # The first low complexity frame ends scoring of the whole read;
        # remaining frames are not evaluated
        if is_kmer_low_complexity:
            maybe_write_fasta(description + f" translation_frame: {frame}",
                              low_complexity_peptide_file_handle, translation)
            return np.nan, n_kmers, f"Low complexity peptide in {molecule}" \
                                    " encoding"

        # NOTE(review): `encoded` is passed here and score_single_translation
        # applies encode_peptide again - confirm the encoding is idempotent
        fraction_in_peptide_db, n_kmers = score_single_translation(
            encoded,
            peptide_bloom_filter,
            peptide_ksize,
            molecule=molecule,
            verbose=verbose)

        # Save the highest jaccard
        max_fraction_in_peptide_db = max(max_fraction_in_peptide_db,
                                         fraction_in_peptide_db)

        if max_fraction_in_peptide_db == fraction_in_peptide_db:
            # Update n_kmers if this is the best translation frame
            max_n_kmers = n_kmers
        if fraction_in_peptide_db > jaccard_threshold:
            if verbose:
                click.echo(f"\t{translation} is above {jaccard_threshold}",
                           err=True)
            seqname = f'{description} translation_frame: {frame} ' \
                      f'jaccard: {fraction_in_peptide_db}'
            # Coding peptides always go to stdout; the nucleotide read is
            # only written when a file handle was provided
            write_fasta(sys.stdout, seqname, translation)
            maybe_write_fasta(seqname, coding_nucleotide_file_handle, sequence)

    # No frame exceeded the threshold, so the read is noncoding
    if max_fraction_in_peptide_db <= jaccard_threshold:
        maybe_write_fasta(description, noncoding_file_handle, sequence)
    return max_fraction_in_peptide_db, max_n_kmers, None
+
+
def maybe_write_fasta(description, file_handle, sequence):
    """Write a fasta record to ``file_handle``, unless it is None"""
    if file_handle is None:
        return
    write_fasta(file_handle, description, sequence)
+
+
def score_reads(reads,
                peptide_bloom_filter,
                jaccard_threshold=None,
                molecule='protein',
                verbose=False,
                coding_nucleotide_fasta=None,
                noncoding_nucleotide_fasta=None,
                low_complexity_nucleotide_fasta=None,
                low_complexity_peptide_fasta=None):
    """Assign a coding score to each read. Where the magic happens.

    Parameters
    ----------
    reads : str
        Sequence file of reads, opened with screed
    peptide_bloom_filter : khmer.Nodegraph
        Database of peptide k-mers; its ksize is used as the peptide ksize
    jaccard_threshold : float or None
        Threshold above which a read is called coding. None selects the
        default for ``molecule``
    molecule : str
        Name of the peptide encoding
    verbose : bool
        If set, write each read name to stderr as it is processed
    coding_nucleotide_fasta, noncoding_nucleotide_fasta, \
    low_complexity_nucleotide_fasta, low_complexity_peptide_fasta : str or None
        Optional filenames for the respective categories of sequence

    Returns
    -------
    scoring_df : pandas.DataFrame
        One row per read, with the columns of SCORING_DF_COLUMNS
    """
    jaccard_threshold = get_jaccard_threshold(jaccard_threshold, molecule)
    peptide_ksize = peptide_bloom_filter.ksize()

    scoring_lines = []
    nucleotide_ksize = 3 * peptide_ksize

    fastas, file_handles = maybe_open_fastas(coding_nucleotide_fasta,
                                             low_complexity_nucleotide_fasta,
                                             low_complexity_peptide_fasta,
                                             noncoding_nucleotide_fasta)
    try:
        with screed.open(reads) as records:
            for record in tqdm(records):
                description = record['name']
                sequence = record['sequence']
                if verbose:
                    # stdout carries the coding peptide fasta, so progress
                    # output has to go to stderr
                    click.echo(description, err=True)

                jaccard, n_kmers, special_case = maybe_score_single_read(
                    description, fastas, file_handles, jaccard_threshold,
                    molecule, nucleotide_ksize, peptide_bloom_filter,
                    peptide_ksize, sequence, verbose)

                line = get_coding_score_line(description, jaccard,
                                             jaccard_threshold, n_kmers,
                                             special_case)
                scoring_lines.append(line)
    finally:
        # Close the output files even if scoring a read fails
        maybe_close_files(file_handles)

    # Concatenate all the lines into a single dataframe
    scoring_df = pd.DataFrame(scoring_lines, columns=SCORING_DF_COLUMNS)
    return scoring_df
+
+
def get_jaccard_threshold(jaccard_threshold, molecule):
    """Return the given threshold, or the molecule's default if it is None

    The default is DEFAULT_HP_JACCARD_THRESHOLD for 'hp' or
    'hydrophobic-polar', and DEFAULT_JACCARD_THRESHOLD for anything else.
    """
    if jaccard_threshold is not None:
        return jaccard_threshold
    if molecule in ('hp', 'hydrophobic-polar'):
        return DEFAULT_HP_JACCARD_THRESHOLD
    return DEFAULT_JACCARD_THRESHOLD
+
+
def maybe_score_single_read(description, fastas, file_handles,
                            jaccard_threshold, molecule, nucleotide_ksize,
                            peptide_bloom_filter, peptide_ksize, sequence,
                            verbose):
    """Check if read is low complexity/too short, otherwise score it

    Parameters
    ----------
    description : str
        Name of the read
    fastas : dict
        Output filenames by sequence type. Unused; kept for callers
    file_handles : dict
        Open output file handles (or None) by sequence type
    jaccard_threshold : float
        Threshold above which a read is called coding
    molecule : str
        Name of the peptide encoding
    nucleotide_ksize : int
        Number of nucleotides spanned by one peptide k-mer
    peptide_bloom_filter : khmer.Nodegraph
        Database of peptide k-mers
    peptide_ksize : int
        Length of the peptide k-mers
    sequence : str
        Nucleotide sequence of the read
    verbose : bool or int
        Values greater than 1 print the resulting score to stderr

    Returns
    -------
    jaccard : float
    n_kmers : int or float
    special_case : str or None
        Reason the read was not scored, or None
    """
    # Check if nucleotide sequence is low complexity
    is_fastp_low_complexity = evaluate_is_fastp_low_complexity(sequence)
    if is_fastp_low_complexity:
        # Number of nucleotide k-mers on the read. Zero or fewer means the
        # read is too short; otherwise it is reported as low complexity.
        # A NaN here would always compare as "too short"
        n_kmers = len(sequence) - nucleotide_ksize + 1
        # Pass the open handles rather than the filenames, since the read is
        # written with maybe_write_fasta
        jaccard, n_kmers, special_case = too_short_or_low_complexity(
            description, file_handles, n_kmers, sequence)
    else:
        jaccard, n_kmers, special_case = score_single_read(
            sequence,
            peptide_bloom_filter,
            peptide_ksize,
            molecule,
            verbose,
            jaccard_threshold=jaccard_threshold,
            description=description,
            noncoding_file_handle=file_handles['noncoding_nucleotide'],
            coding_nucleotide_file_handle=file_handles['coding_nucleotide'],
            low_complexity_peptide_file_handle=file_handles[
                'low_complexity_peptide'])

        if verbose > 1:
            click.echo(f"Jaccard: {jaccard}, n_kmers = {n_kmers}", err=True)
    return jaccard, n_kmers, special_case
+
+
def too_short_or_low_complexity(description, fastas, n_kmers, sequence):
    """Label a read that is not scored as either too short or low complexity

    Parameters
    ----------
    description : str
        Name of the read
    fastas : dict
        Its 'low_complexity_nucleotide' entry is handed to
        ``maybe_write_fasta`` as the file handle when ``n_kmers > 0``
    n_kmers : int or float
        Number of k-mers on the read. Anything not greater than 0, NaN
        included, marks the read as too short
    sequence : str
        Nucleotide sequence of the read

    Returns
    -------
    jaccard : float
        Always NaN
    n_kmers : int or float
        The given ``n_kmers`` for low complexity reads, NaN for short ones
    special_case : str
        Reason the read was not scored
    """
    if not n_kmers > 0:
        return np.nan, np.nan, \
            'Read length was shorter than 3 * peptide k-mer size'

    maybe_write_fasta(description, fastas['low_complexity_nucleotide'],
                      sequence)
    return np.nan, n_kmers, "Low complexity nucleotide"
+
+
def maybe_close_files(file_handles):
    """Close every file handle in the dict that is not None"""
    still_open = [handle for handle in file_handles.values()
                  if handle is not None]
    for handle in still_open:
        handle.close()
+
+
def get_coding_score_line(description, jaccard, jaccard_threshold, n_kmers,
                          special_case):
    """Build one row of the coding score table for a read

    The classification is ``special_case`` when there is one, otherwise
    'Coding' if ``jaccard`` is above ``jaccard_threshold`` and 'Non-coding'
    if not.
    """
    if special_case is not None:
        classification = special_case
    elif jaccard > jaccard_threshold:
        classification = 'Coding'
    else:
        classification = 'Non-coding'
    return [description, jaccard, n_kmers, classification]
+
+
def maybe_open_fastas(coding_nucleotide_fasta, low_complexity_nucleotide_fasta,
                      low_complexity_peptide_fasta,
                      noncoding_nucleotide_fasta):
    """Open the requested output fastas

    Returns
    -------
    fastas : dict
        Filename (or None) for each sequence type
    file_handles : dict
        For each sequence type, the handle returned by ``open_and_announce``
        when a filename was given, otherwise None
    """
    fastas = dict(
        noncoding_nucleotide=noncoding_nucleotide_fasta,
        coding_nucleotide=coding_nucleotide_fasta,
        low_complexity_nucleotide=low_complexity_nucleotide_fasta,
        low_complexity_peptide=low_complexity_peptide_fasta,
    )
    file_handles = {
        seqtype: (None if fasta is None
                  else open_and_announce(fasta, seqtype))
        for seqtype, fasta in fastas.items()
    }
    return fastas, file_handles
+
+
def maybe_write_csv(coding_scores, csv):
    """Write the coding scores to ``csv`` if a filename was given

    The destination is announced on stderr and the table is written without
    its index.
    """
    if not csv:
        return
    click.echo(f"Writing coding scores of reads to {csv}", err=True)
    coding_scores.to_csv(csv, index=False)
+
+
def maybe_write_json_summary(coding_scores, json_summary):
    """Write a json summary of the coding scores if a filename was given

    Parameters
    ----------
    coding_scores : pandas.DataFrame
        Table with ``classification`` and ``jaccard_in_peptide_db`` columns
    json_summary : str or False
        Filename to write to; nothing happens when it is falsy

    Notes
    -----
    The summary holds the ``describe()`` statistics of the Jaccard scores,
    and the count and percentage of reads in each classification.
    """
    if not json_summary:
        return

    classification_value_counts = \
        coding_scores.classification.value_counts()
    classification_percentages = 100 * classification_value_counts / \
        classification_value_counts.sum()

    metadata = {
        'jaccard_info':
            coding_scores.jaccard_in_peptide_db.describe().to_dict(),
        'classification_value_counts':
            classification_value_counts.to_dict(),
        'classification_percentages':
            classification_percentages.to_dict()
    }
    with open(json_summary, 'w') as f:
        # stdout carries the coding peptide fasta, so status messages have
        # to go to stderr
        click.echo(f"Writing extract_coding summary to {json_summary}",
                   err=True)
        json.dump(metadata, fp=f)
+
+
+@click.command()
+@click.argument('peptides', nargs=1)
+@click.argument('reads', nargs=-1, required=True)
+@click.option('--peptide-ksize',
+              default=None,
+              help="K-mer size of the peptide sequence to use. Defaults for"
+              " different molecules are, "
+              f"protein: {DEFAULT_PROTEIN_KSIZE}"
+              f", dayhoff: {DEFAULT_DAYHOFF_KSIZE},"
+              f" hydrophobic-polar: {DEFAULT_HP_KSIZE}")
+@click.option("--save-peptide-bloom-filter",
+              is_flag=True,
+              default=False,
+              help="If specified, save the peptide bloom filter. "
+              "Default filename is the name of the fasta file plus a "
+              "suffix denoting the protein encoding and peptide ksize")
+@click.option('--peptides-are-bloom-filter',
+              is_flag=True,
+              default=False,
+              help="Peptide file is already a bloom filter")
+@click.option('--jaccard-threshold',
+              default=None, type=click.FLOAT, callback=validate_jaccard,
+              help="Minimum fraction of peptide k-mers from read in the "
+                   "peptide database for this read to be called a "
+                   f"'coding read'. Default: {DEFAULT_JACCARD_THRESHOLD} for"
+                   f" protein and dayhoff encodings, and "
+                   f"{DEFAULT_HP_JACCARD_THRESHOLD} for hydrophobic-polar "
+                   f"(hp) encoding")
+@click.option('--molecule',
+              default='protein',
+              help="The type of amino acid encoding to use. Default is "
+              "'protein', but 'dayhoff' or 'hydrophobic-polar' can be "
+              "used")
+@click.option('--csv',
+              default=False,
+              help='Name of csv file to write with all sequence reads and '
+              'their coding scores')
+@click.option('--json-summary',
+              default=False,
+              help='Name of json file to write summarization of coding/'
+              'noncoding/other categorizations, and summary statistics '
+              '(min/max/mean/median/stddev, etc.) of the Jaccard scores')
+@click.option("--coding-nucleotide-fasta",
+              help="If specified, save the coding nucleotides to this file")
+@click.option("--noncoding-nucleotide-fasta",
+              help="If specified, save the noncoding nucleotides to this file")
+@click.option("--low-complexity-nucleotide-fasta",
+              help="If specified, save the low-complexity nucleotides to this"
+                   " file")
+@click.option("--low-complexity-peptide-fasta",
+              help="If specified, save the low-complexity peptides to this "
+                   "file")
+@click.option('--tablesize', type=BASED_INT,
+              default="1e8",
+              help='Size of the bloom filter table to use')
+@click.option('--n-tables', type=int,
+              default=DEFAULT_N_TABLES,
+              help='Number of bloom filter tables to use')
+@click.option("--long-reads",
+              is_flag=True,
+              help="If set, then only considers reading frames starting with "
+              "start codon (ATG) and ending in a stop codon "
+              "(TAG, TAA, TGA)")
+@click.option("--verbose", is_flag=True, help="Print more output")
+def cli(peptides,
+        reads,
+        peptide_ksize=None,
+        save_peptide_bloom_filter=True,
+        peptides_are_bloom_filter=False,
+        jaccard_threshold=None,
+        molecule='protein',
+        csv=False,
+        json_summary=False,
+        coding_nucleotide_fasta=None,
+        noncoding_nucleotide_fasta=None,
+        low_complexity_nucleotide_fasta=None,
+        low_complexity_peptide_fasta=None,
+        tablesize=DEFAULT_MAX_TABLESIZE, n_tables=DEFAULT_N_TABLES,
+        long_reads=False,
+        verbose=False):
+    """Writes coding peptides from reads to standard output
+
+    \b
+    Sane defaults for peptide_ksize for different peptide encodings:
+    - with "protein" or "peptide" --> --peptide-ksize = 5-10
+      7 is pretty universal but can go down to 5 for less species specificity
+      and up to 10 to be very specific
+    - with "dayhoff" --> --peptide-ksize = 10-15
+    - with "hydrophobic-polar" or "hp" --> --peptide-ksize = 15-21
+      15 is pretty good but can do up to 21
+
+    \b
+    Parameters
+    ----------
+    reads : str
+        Sequence file of reads to filter
+    peptides : str
+        Sequence file of peptides
+    peptide_ksize : int
+        Number of characters in amino acid words
+    save_peptide_bloom_filter : str or bool
+        Whether or not to save the created bloom filter to file. If a string,
+        save to this filename
+    peptides_are_bloom_filter : bool
+        Input file of peptides is already a bloom filter
+    jaccard_threshold : float
+        Value between 0 and 1. By default, the (empirically-chosen) "best"
+        threshold is chosen for each molecule. For "protein" and "dayhoff",
+        the default is 0.5, and for "hydrophobic-polar," it is 0.8, since it is
+        so lossy it's more likely to match random sequence. These thresholds
+        were determined empirically with a pre-chosen human RNA-seq dataset and
+        human peptides.
+    molecule : str
+        One of "protein"|"peptide", "dayhoff", or "hydrophobic-polar"|"hp" to
+        encode the protein-coding space. Where "protein"|"peptide" is the
+        original 20-letter amino acid encoding, Dayhoff ("dayhoff") is a lossy
+        6-letter encoding that categorizes the amino acids into:
+            1. Cysteine,
+            2. Small (A, G, P, S, T)
+            3. Acid and Amide (D, E, N, Q)
+            4. Basic (H, K, R)
+            5. Hydrophobic (I, L, M, V)
+            6. Aromatic (F, W, Y)
+        Hydrophobic-polar maps to a mere two categories:
+            1. Hydrophobic (A, F, G, I, L, M, P, V, W, Y)
+            2. Polar (C, D, E, H, K, N, Q, R, S, T)
+    csv : str
+        Save the coding scores as a csv to this file
+    long_reads : bool -- NOT IMPLEMENTED!!
+        Input sequencing reads are long reads. Not implemented, but the plan
+        is, instead of doing 6-frame translation as on the short reads, test
+        all ATG (start codon) to stop codon reading frames for the one(s) that
+        matches the known peptide database best. Unknown whether this requires
+        new thresholds
+    coding_nucleotide_fasta : None or str
+        If specified, save coding nucleotide sequence to this file
+    noncoding_nucleotide_fasta : None or str
+        If specified, save noncoding nucleotide sequence to this file
+    low_complexity_nucleotide_fasta : None or str
+        If specified, save low complexity nucleotide sequence to this file
+    low_complexity_peptide_fasta : None or str
+        If specified, save low complexity peptide sequence to this file
+    verbose : bool
+        Whether or not to print lots of stuff. This is an on/off flag
+        (--verbose); it cannot be repeated to increase verbosity
+
+    \b
+    Returns
+    -------
+    coding_peptides : str
+        Outputs a fasta-formatted sequence of translated peptides
+    """
+    # \b above prevents re-wrapping of paragraphs
+
+    if long_reads:
+        raise NotImplementedError("Not implemented! ... yet :)")
+
+    peptide_bloom_filter = maybe_make_peptide_bloom_filter(
+        peptides, peptide_ksize, molecule, peptides_are_bloom_filter,
+        n_tables=n_tables, tablesize=tablesize)
+    click.echo("\tDone!", err=True)
+
+    if not peptides_are_bloom_filter:
+        maybe_save_peptide_bloom_filter(peptides, peptide_bloom_filter,
+                                        molecule, save_peptide_bloom_filter)
+
+    dfs = []
+    for reads_file in reads:
+        df = score_reads(
+            reads_file,
+            peptide_bloom_filter,
+            jaccard_threshold=jaccard_threshold,
+            molecule=molecule,
+            verbose=verbose,
+            coding_nucleotide_fasta=coding_nucleotide_fasta,
+            noncoding_nucleotide_fasta=noncoding_nucleotide_fasta,
+            low_complexity_nucleotide_fasta=low_complexity_nucleotide_fasta,
+            low_complexity_peptide_fasta=low_complexity_peptide_fasta)
+        df['filename'] = reads_file
+        dfs.append(df)
+
+    coding_scores = pd.concat(dfs, ignore_index=True)
+
+    maybe_write_csv(coding_scores, csv)
+    maybe_write_json_summary(coding_scores, json_summary)
+
+
+if __name__ == '__main__':
+ cli()
diff --git a/khtools/hello.py b/khtools/hello.py
deleted file mode 100644
index 7a949253..00000000
--- a/khtools/hello.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""
-This is an example subcommand for a git-like interface
-
-Cribbed from the Click documentation https://click.palletsprojects.com/en/7.x/
-"""
-import random
-
-import click
-from tqdm import tqdm
-
-# Import modified 'os' module with LC_LANG set so click doesn't complain.
-# The '# noqa: F401' line prevents the linter from complaining about the unused
-# import.
-from .os_utils import os # noqa: F401
-
-
-COLORS = 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan'
-
-
-@click.command()
-@click.option('--count', default=5, help='Number of greetings.')
-@click.option('--name', prompt='Your name',
- help='The person to greet.')
-def hello(count, name):
- """Simple program that greets NAME for a total of COUNT times, in color."""
- for x in tqdm(range(count)):
- # note that colorama.init() doesn't need to be called for the colors
- # to work
- click.echo(click.style('Hello %s!' % name, fg=random.choice(COLORS)))
diff --git a/khtools/jupyter_utils.py b/khtools/jupyter_utils.py
index cc07530a..c9889f58 100644
--- a/khtools/jupyter_utils.py
+++ b/khtools/jupyter_utils.py
@@ -4,7 +4,6 @@
import ipykernel
import requests
-
from requests.compat import urljoin
from notebook.notebookapp import list_running_servers
diff --git a/khtools/os_utils.py b/khtools/os_utils.py
index ab0ad26b..e9252ddc 100644
--- a/khtools/os_utils.py
+++ b/khtools/os_utils.py
@@ -1,7 +1,6 @@
import os
import subprocess
-
# Set input language USA unicode encoding setting
# Necessary because click assumes ascii input unless otherwise specified
# https://click.palletsprojects.com/en/7.x/python3/
@@ -60,7 +59,8 @@ def get_stdout_stderr_from_command(command):
lines : list
Newline-separated strings from output of command
"""
- result = subprocess.run(command, stdout=subprocess.PIPE,
+ result = subprocess.run(command,
+ stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout = decode(result.stdout)
stderr = decode(result.stderr)
diff --git a/khtools/sequence_encodings.py b/khtools/sequence_encodings.py
index eba15448..cf192869 100644
--- a/khtools/sequence_encodings.py
+++ b/khtools/sequence_encodings.py
@@ -1,5 +1,5 @@
DNA_ALPHABET = "A", "C", "G", "T"
-AMINO_ACID_SINGLE_LETTERS = "R", "H", "K", "D", "E", "S", "T", "N", "Q", "C",\
+AMINO_ACID_SINGLE_LETTERS = "R", "H", "K", "D", "E", "S", "T", "N", "Q", "C", \
"G", "P", "A", "V", "I", "L", "M", "F", "Y", "W"
DAYHOFF_MAPPING = {
"C": "a",
@@ -81,16 +81,16 @@
"Y": "h",
# Hydrophilic - polar
- "N": 'p',
"C": 'p',
- "S": "p",
- "T": "p",
- "D": "p",
+ "D": 'p',
"E": "p",
- "R": "p",
"H": "p",
"K": "p",
- "Q": "p"
+ "N": "p",
+ "Q": "p",
+ "R": "p",
+ "S": "p",
+ "T": "p"
}
BOTVINNIK_MAPPING = {
# Small and hydrophobic
@@ -131,24 +131,10 @@
"H": "k",
"P": "m"
}
-PURINE_PYRIMIDINE_MAPPING = {
- "A": "R",
- "C": "Y",
- "G": "R",
- "T": "Y"
-}
-AMINO_KETO_MAPPING = {
- "A": "M",
- "C": "M",
- "G": "K",
- "T": "K"
-}
-WEAK_STRONG_MAPPING = {
- "A": "W",
- "C": "S",
- "G": "S",
- "T": "W"
-}
+
+PURINE_PYRIMIDINE_MAPPING = {"A": "R", "C": "Y", "G": "R", "T": "Y"}
+AMINO_KETO_MAPPING = {"A": "M", "C": "M", "G": "K", "T": "K"}
+WEAK_STRONG_MAPPING = {"A": "W", "C": "S", "G": "S", "T": "W"}
AMINO_KETO_TRANSLATION = str.maketrans(AMINO_KETO_MAPPING)
WEAK_STRONG_TRANSLATION = str.maketrans(WEAK_STRONG_MAPPING)
PURINE_PYRIMIDINE_TRANSLATION = str.maketrans(PURINE_PYRIMIDINE_MAPPING)
@@ -158,8 +144,11 @@
BOTVINNIK_TRANSLATION = str.maketrans(BOTVINNIK_MAPPING)
+VALID_PEPTIDE_MOLECULES = 'protein', 'peptide', 'dayhoff', \
+ 'hydrophobic-polar', 'hp'
# Nucleic acid mappings
+
def amino_keto_ize(seq):
return seq.translate(AMINO_KETO_TRANSLATION)
@@ -187,3 +176,16 @@ def hpize(seq):
def botvinnikize(seq):
return seq.translate(BOTVINNIK_TRANSLATION)
+
+
+def encode_peptide(peptide_sequence, molecule):
+    """Encode a peptide per `molecule`; protein/peptide is returned as-is"""
+    if molecule == 'dayhoff':
+        return dayhoffize(peptide_sequence)
+    elif molecule == 'hydrophobic-polar' or molecule == 'hp':
+        return hpize(peptide_sequence)
+    elif molecule in VALID_PEPTIDE_MOLECULES:
+        return peptide_sequence
+    else:
+        raise ValueError(f"{molecule} is not a valid amino acid encoding, only"
+                         f" {', '.join(VALID_PEPTIDE_MOLECULES)} can be used")
diff --git a/khtools/tests/conftest.py b/khtools/tests/conftest.py
deleted file mode 100644
index 21f77e68..00000000
--- a/khtools/tests/conftest.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-
-import pytest
-
-
-"""
-conftest.py contains fixtures or functions-turned-variables that can be
-used in any test
-"""
-
-
-@pytest.fixture
-def data_folder():
- """Absolute path to where test data is stored"""
- return os.path.join(os.path.abspath(os.path.dirname(__file__)),
- './data')
diff --git a/khtools/tests/test_hello.py b/khtools/tests/test_hello.py
deleted file mode 100755
index 70b13577..00000000
--- a/khtools/tests/test_hello.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-test_hello
-----------------------------------
-
-Tests for `khtools` module.
-"""
-
-from click.testing import CliRunner
-
-
-def test_hello():
- from khtools.hello import hello
-
- runner = CliRunner()
- result = runner.invoke(hello, input="Rosalind Franklin")
-
- assert result.exit_code == 0
- assert result.output.count("Hello Rosalind Franklin") == 5
-
-
-def test_hello_name():
- from khtools.hello import hello
-
- runner = CliRunner()
- result = runner.invoke(hello, ["--name", "Rosalind"])
-
- assert result.exit_code == 0
- assert result.output.count("Hello Rosalind") == 5
-
-
-def test_hello_count():
- from khtools.hello import hello
-
- runner = CliRunner()
- result = runner.invoke(hello, ["--count", "10",
- "--name", "Rosalind"])
-
- assert result.exit_code == 0
- assert result.output.count("Hello Rosalind") == 10
diff --git a/setup.cfg b/setup.cfg
index 5e409001..38734a18 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,5 @@
[wheel]
universal = 1
+
+[yapf]
+based_on_style = pep8
diff --git a/khtools/tests/__init__.py b/tests/__init__.py
similarity index 100%
rename from khtools/tests/__init__.py
rename to tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..471ffd2d
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,94 @@
+import os
+
+import pytest
+"""
+conftest.py contains fixtures or functions-turned-variables that can be
+used in any test
+"""
+from khtools.bloom_filter import DEFAULT_PROTEIN_KSIZE, \
+ DEFAULT_DAYHOFF_KSIZE, DEFAULT_HP_KSIZE
+
+
+@pytest.fixture
+def data_folder():
+ """Absolute path to where test data is stored"""
+ return os.path.join(os.path.abspath(os.path.dirname(__file__)), './data')
+
+
+@pytest.fixture
+def peptide_fasta(data_folder):
+    """Path to the gzipped human peptide subset fasta in the test data"""
+    return os.path.join(data_folder, 'bloom_filter',
+                        'Homo_sapiens.GRCh38.pep.subset.fa.gz')
+
+
+@pytest.fixture
+def adversarial_peptide_fasta(data_folder):
+    """Path to the 'first1000lines' human peptide fasta in the test data"""
+    return os.path.join(data_folder, 'bloom_filter',
+                        'Homo_sapiens.GRCh38.pep.first1000lines.fa')
+
+
+@pytest.fixture(params=['normal', 'adversarial'])
+def variable_peptide_fasta(request, peptide_fasta, adversarial_peptide_fasta):
+    """Peptide fasta path, parametrized over the two fasta fixtures"""
+    if request.param == 'normal':
+        return peptide_fasta
+    return adversarial_peptide_fasta
+
+
+# Tie the molecule name to its default ksize to make sure we keep getting the
+# right sequences
+@pytest.fixture(params=[('protein', DEFAULT_PROTEIN_KSIZE),
+ ('dayhoff', DEFAULT_DAYHOFF_KSIZE),
+ pytest.param(('dayhoff', DEFAULT_PROTEIN_KSIZE),
+ marks=pytest.mark.xfail),
+ ('hydrophobic-polar', DEFAULT_HP_KSIZE),
+ pytest.param(
+ ('hydrophobic-polar', DEFAULT_PROTEIN_KSIZE),
+ marks=pytest.mark.xfail)],
+ ids=[
+ 'protein_default_ksize', 'dayhoff_default_ksize',
+ 'dayhoff_protein_ksize_xfail', 'hp_default_ksize',
+ 'hp_protein_ksize_xfail'
+])
+def molecule_ksize(request):
+ return request.param
+
+
+@pytest.fixture
+def peptide_ksize(molecule_ksize):
+ return molecule_ksize[1]
+
+
+@pytest.fixture
+def molecule(molecule_ksize):
+ return molecule_ksize[0]
+
+
+@pytest.fixture
+def peptide_bloom_filter_path(data_folder, molecule, peptide_ksize):
+    """Path of the nodegraph file for this molecule and peptide ksize"""
+    basename = (
+        f'Homo_sapiens.GRCh38.pep.subset.molecule-{molecule}_'
+        f'ksize-{peptide_ksize}.bloomfilter.nodegraph'
+    )
+    return os.path.join(data_folder, 'bloom_filter', basename)
+
+
+@pytest.fixture
+def peptide_bloom_filter(peptide_bloom_filter_path, peptide_fasta, molecule,
+                         peptide_ksize):
+    """Load bloom filter from path if exists, otherwise, make and save it"""
+    from khtools.bloom_filter import load_nodegraph
+    try:
+        return load_nodegraph(peptide_bloom_filter_path)
+    except OSError:
+        # OSError also covers FileNotFoundError: build the filter and save it
+        from khtools.bloom_filter import make_peptide_bloom_filter
+        bloom_filter = make_peptide_bloom_filter(peptide_fasta,
+                                                 peptide_ksize,
+                                                 molecule,
+                                                 tablesize=1e6)
+        bloom_filter.save(peptide_bloom_filter_path)
+        return bloom_filter
diff --git a/khtools/tests/data/.gitkeep b/tests/data/.gitkeep
similarity index 100%
rename from khtools/tests/data/.gitkeep
rename to tests/data/.gitkeep
diff --git a/khtools/tests/data/ENSP00000354687.pkl b/tests/data/ENSP00000354687.pkl
similarity index 100%
rename from khtools/tests/data/ENSP00000354687.pkl
rename to tests/data/ENSP00000354687.pkl
diff --git a/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_first20.fq.gz b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_first20.fq.gz
new file mode 100644
index 00000000..5cd527fb
Binary files /dev/null and b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_first20.fq.gz differ
diff --git a/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq
new file mode 100644
index 00000000..9e8ced6e
--- /dev/null
+++ b/tests/data/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq
@@ -0,0 +1,92 @@
+@SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1
+CGCTTGCTTAATACTGACATCAATAATATTAGGAAAATCGCAATATAACTGTAAATCCTGTTCTGTC
++
+322/2415652337555776752654675357764447564646644378654364939545:;538
+@SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1
+TCTAGAATGTGAAATAACGTACTTCATGTGTCTTCTTACCAAAAATACCAACGATAAGGGGAAAAGCCATC
++
+-0226727656145464554477797863768459454565555453855566855376368750886647
+@SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1
+CAATCATCATCACTTTCTAATTCCAGAATATTTTCATCACCCCAAAAAGAAATCCTAAATCCATTAGC
++
+2//004684572653325355467595624554598657657663644:6433;675575:6936684
+@SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1
+CAAAAGTGAAGACCTCCCTGGGGTCTTCAAAGACAGCCTTTGCTCTCCATGTAGCCAATGGTGCTCT
++
+6022356557706648784564628446554755486554756596986885587756554286585
+@SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1
+GTAACCCTTAATTCCTCAGAAAAAATAGACAACATAGTGGAGTGGGATGGAGGAAC
++
+30332625662567434687545364375744766473573546655765668745
+@SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1
+CTGTGATTATTTTTTTTCCTAGCATTTGTTCTTTCAGAAAAGGACTGAACTCTAAATTCTGGACTTGAAGACTG
++
+5350:57437356354566558756576348866:7;6643836778525:85667688<86667477475556
+@SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1
+GTTTGATTCTTGACAATTTCTTCTGGAACAAGTCTTTCATATACATTAGACTGGTATCATTGAGTTCTGAGC
++
+21115386673726345758678888346647476567794718586294896426:777786746;88557
+@SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1
+CACAAGGCTAACATACACAATCTGTAACACGAGATGGATAGCACACACATATGACACAATTTC
++
+61452257562856465866687736155375878767359637353643596776677544;
+@SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1
+CTGTAGTGTGGTCCATTCCCAGACAGCAACATGCAAGAATAAGTTTACAATACACTCAGCCCTTCTG
++
+324/369683845456484777454966366774539647446284364472965687967666555
+@SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1
+TAGGAAAATAGTAATATTTGCAACTTATGAATGATAAGTCAGAAAAGTTACATGGAATGTTAAATTTT
++
+,/47363256812646557466668738876:78754746593546:554625679796776775856
+@SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1
+CTGATAGACTGAAACTGGTTTTGTTATTCTTAACGTTCTCCAGTCTGCACTCTGCTGTGCTGTCTGTGCTC
++
+33322463673034336624677544645544414254565543869624775658457667567637565
+@SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1
+TTAAGTTCTAGTCTGTGAGCACTTGTAGTTCAATAATCGTCATCTTCATCAGAGTCCATTACTTTTCTTCTGTTG
++
+-4215189658489557777668:7529368673668567987;:6;55877774;795647=768=87::4265
+@SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1
+AAATAAGGTAACATTTAACAATAATCTGATACACATAAATAGAGAAAGAGCAATTGATAAAGTAAATG
++
++/123653252555435435755265765346547543473736454675656677;5566:343676
+@SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1
+CTTGGATCCACCATACTCAAGAGTTATTACACAAAGGGAAACAGAAAATAACCAAATGACATCAGAA
++
+42065467567555352764747445673547754756442445353345577667875987:6676
+@SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1
+CCAACACATCCAATTTGTATTTTCTTAAATATGTGTTTCTTAGGTATCTAAGGATACATGAGCGAGCCC
++
+3/221611467247456247465754444476666257966444376:775879558866696749866
+@SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1
+ATCTCTTGTAATAATTCAACATATTCCCTGGCTATTAACTAATTTCCAAGCCTGAACTGTCA
++
++2211175462223332546843655755866666343355754885535757574685477
+@SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1
+TGAGGCAAACAGCAAGAGTAAGCAGTGTTACTTGCAGGTACTTTGGTTAATGTTGATTTAAATTTTCATG
++
++31546132745663636164653655113459674751345574646339858877:63747864:796
+@SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1
+CAATGCCGTGCCAGTGGAGACTGTTCTCGTATGCC
++
+7233774594<778467675:87567366425937
+@SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1
+CCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAAC
++
+3/032525448575344564775543445846735376486767:5786677655;5745767657556737
+@SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1
+CAGGAATGAGGCCCCGACTAAAATTCGCTGCAAAAGCCCAAAATCTAGTTAGCATAAATTCCTCAGACATG
++
+4/3525833147443336366546545547666638656745567547545878665656;8683:86795
+@SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1
+ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
++
+(04147:;:9<<:7;88<>=@>>8<;;<=;C;>;:5:;9<<::6@;E;?:C@=:9:67
+@SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1
+CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACACCC
++
+2263688B;9<<9;=;9<=><:;=:@<@<<;@;S5:;;MENSP00000488240.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142847306:142847317:1 gene:ENSG00000282253.1 transcript:ENST00000631435.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158]
+GTGG
+>ENSP00000451042.1 pep chromosome:GRCh38:14:22438547:22438554:1 gene:ENSG00000223997.1 transcript:ENST00000415118.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD1 description:T cell receptor delta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12254]
+EI
+>ENSP00000452494.1 pep chromosome:GRCh38:14:22449113:22449125:1 gene:ENSG00000228985.1 transcript:ENST00000448914.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD3 description:T cell receptor delta diversity 3 [Source:HGNC Symbol;Acc:HGNC:12256]
+TGGY
+>ENSP00000451515.1 pep chromosome:GRCh38:14:22439007:22439015:1 gene:ENSG00000237235.2 transcript:ENST00000434970.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD2 description:T cell receptor delta diversity 2 [Source:HGNC Symbol;Acc:HGNC:12255]
+PSY
+>ENSP00000487941.1 pep chromosome:GRCh38:7:142786213:142786224:1 gene:ENSG00000282431.1 transcript:ENST00000632684.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158]
+GTGG
+>ENSP00000418639.1 pep chromosome:GRCh38:14:105865551:105865561:-1 gene:ENSG00000236597.1 transcript:ENST00000439842.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD7-27 description:immunoglobulin heavy diversity 7-27 [Source:HGNC Symbol;Acc:HGNC:5518]
+LTG
+>ENSP00000420733.1 pep chromosome:GRCh38:14:105881034:105881053:-1 gene:ENSG00000211907.1 transcript:ENST00000390567.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-26 description:immunoglobulin heavy diversity 1-26 [Source:HGNC Symbol;Acc:HGNC:5485]
+GIVGAT
+>ENSP00000417751.1 pep chromosome:GRCh38:14:105881539:105881556:-1 gene:ENSG00000225825.1 transcript:ENST00000452198.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-25 description:immunoglobulin heavy diversity 6-25 [Source:HGNC Symbol;Acc:HGNC:5516]
+GYSSGY
+>ENSP00000419139.1 pep chromosome:GRCh38:14:105883903:105883922:-1 gene:ENSG00000211909.1 transcript:ENST00000390569.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-24 description:immunoglobulin heavy diversity 5-24 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5510]
+VEMATI
+>ENSP00000430248.1 pep chromosome:GRCh38:14:105884870:105884888:-1 gene:ENSG00000227196.1 transcript:ENST00000437320.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-23 description:immunoglobulin heavy diversity 4-23 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5504]
+*LRW*L
+>ENSP00000429952.1 pep chromosome:GRCh38:14:105886031:105886061:-1 gene:ENSG00000211911.1 transcript:ENST00000390571.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-22 description:immunoglobulin heavy diversity 3-22 [Source:HGNC Symbol;Acc:HGNC:5497]
+VLL***WLLL
+>ENSP00000429324.1 pep chromosome:GRCh38:14:105888551:105888578:-1 gene:ENSG00000211912.1 transcript:ENST00000390572.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-21 description:immunoglobulin heavy diversity 2-21 [Source:HGNC Symbol;Acc:HGNC:5491]
+SILWW*LLF
+>ENSP00000418010.1 pep chromosome:GRCh38:14:105891699:105891719:-1 gene:ENSG00000211914.1 transcript:ENST00000390574.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-19 description:immunoglobulin heavy diversity 6-19 [Source:HGNC Symbol;Acc:HGNC:5515]
+GYSSGWY
+>ENSP00000417555.1 pep chromosome:GRCh38:14:105893542:105893561:-1 gene:ENSG00000211915.1 transcript:ENST00000390575.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-18 description:immunoglobulin heavy diversity 5-18 [Source:HGNC Symbol;Acc:HGNC:5509]
+VDTAMV
+>ENSP00000428366.1 pep chromosome:GRCh38:14:105895634:105895670:-1 gene:ENSG00000211917.1 transcript:ENST00000390577.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-16 description:immunoglobulin heavy diversity 3-16 [Source:HGNC Symbol;Acc:HGNC:5496]
+VL*LRLGELSLY
+>ENSP00000431089.1 pep chromosome:GRCh38:14:105894508:105894523:-1 gene:ENSG00000227800.1 transcript:ENST00000431870.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-17 description:immunoglobulin heavy diversity 4-17 [Source:HGNC Symbol;Acc:HGNC:5503]
+*LR*L
+>ENSP00000420556.1 pep chromosome:GRCh38:14:105891191:105891207:-1 gene:ENSG00000237020.1 transcript:ENST00000450276.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-20 description:immunoglobulin heavy diversity 1-20 [Source:HGNC Symbol;Acc:HGNC:5484]
+GITGT
+>ENSP00000427969.1 pep chromosome:GRCh38:14:105897957:105897987:-1 gene:ENSG00000211918.1 transcript:ENST00000390578.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-15 description:immunoglobulin heavy diversity 2-15 [Source:HGNC Symbol;Acc:HGNC:5489]
+RIL*WW*LLL
+>ENSP00000418765.1 pep chromosome:GRCh38:14:105900638:105900654:-1 gene:ENSG00000227108.1 transcript:ENST00000451044.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-14 description:immunoglobulin heavy diversity 1-14 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5483]
+GITGT
+>ENSP00000419564.1 pep chromosome:GRCh38:14:105901142:105901162:-1 gene:ENSG00000211920.1 transcript:ENST00000390580.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-13 description:immunoglobulin heavy diversity 6-13 [Source:HGNC Symbol;Acc:HGNC:5514]
+GYSSSWY
+>ENSP00000419283.1 pep chromosome:GRCh38:14:105902649:105902671:-1 gene:ENSG00000211921.1 transcript:ENST00000390581.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-12 description:immunoglobulin heavy diversity 5-12 [Source:HGNC Symbol;Acc:HGNC:5508]
+VDIVATI
+>ENSP00000430034.1 pep chromosome:GRCh38:14:105903616:105903631:-1 gene:ENSG00000232543.2 transcript:ENST00000431440.2 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-11 description:immunoglobulin heavy diversity 4-11 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5502]
+*LQ*L
+>ENSP00000419773.1 pep chromosome:GRCh38:14:105904497:105904527:-1 gene:ENSG00000211923.1 transcript:ENST00000390583.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-10 description:immunoglobulin heavy diversity 3-10 [Source:HGNC Symbol;Acc:HGNC:5495]
+VLLWFGELL
+>ENSP00000488840.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105920273:105920289:-1 gene:ENSG00000282714.1 transcript:ENST00000633210.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-1 description:immunoglobulin heavy diversity 1-1 [Source:HGNC Symbol;Acc:HGNC:5482]
+GTTGT
+>ENSP00000475053.2 pep chromosome:GRCh38:15:21011451:21011469:-1 gene:ENSG00000270451.1 transcript:ENST00000603693.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4OR15-4B description:immunoglobulin heavy diversity 4/OR15-4B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5507]
+*LWC*L
+>ENSP00000474222.1 pep chromosome:GRCh38:15:21017800:21017816:-1 gene:ENSG00000270185.1 transcript:ENST00000604838.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1OR15-1B description:immunoglobulin heavy diversity 1/OR15-1B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5488]
+GITGT
+>ENSP00000473700.1 pep chromosome:GRCh38:15:21010494:21010516:-1 gene:ENSG00000270824.1 transcript:ENST00000604446.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5OR15-5B description:immunoglobulin heavy diversity 5/OR15-5B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5513]
+VDIVSTI
+>ENSP00000474017.2 pep chromosome:GRCh38:15:21015048:21015078:-1 gene:ENSG00000282268.1 transcript:ENST00000604102.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2OR15-2B description:immunoglobulin heavy diversity 2/OR15-2B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5494]
+RIL**YYFLC
+>ENSP00000474573.2 pep chromosome:GRCh38:15:21012559:21012589:-1 gene:ENSG00000282089.1 transcript:ENST00000603935.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3OR15-3B description:immunoglobulin heavy diversity 3/OR15-3B (non-functional) [Source:HGNC Symbol;Acc:HGNC:5501]
+VL*FLDWLLY
+>ENSP00000488695.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105866322:105866332:-1 gene:ENSG00000282455.1 transcript:ENST00000632524.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD7-27 description:immunoglobulin heavy diversity 7-27 [Source:HGNC Symbol;Acc:HGNC:5518]
+LTG
+>ENSP00000488000.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105881805:105881824:-1 gene:ENSG00000282323.1 transcript:ENST00000633009.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-26 description:immunoglobulin heavy diversity 1-26 [Source:HGNC Symbol;Acc:HGNC:5485]
+GIVGAT
+>ENSP00000488392.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105882310:105882327:-1 gene:ENSG00000282724.1 transcript:ENST00000634070.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-25 description:immunoglobulin heavy diversity 6-25 [Source:HGNC Symbol;Acc:HGNC:5516]
+GYSSGY
+>ENSP00000488113.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105884674:105884693:-1 gene:ENSG00000282674.1 transcript:ENST00000632963.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-24 description:immunoglobulin heavy diversity 5-24 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5510]
+VEMATI
+>ENSP00000488168.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105885641:105885659:-1 gene:ENSG00000282640.1 transcript:ENST00000633030.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-23 description:immunoglobulin heavy diversity 4-23 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5504]
+*LRW*L
+>ENSP00000488711.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105886802:105886832:-1 gene:ENSG00000282396.1 transcript:ENST00000633765.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-22 description:immunoglobulin heavy diversity 3-22 [Source:HGNC Symbol;Acc:HGNC:5497]
+VLL***WLLL
+>ENSP00000487599.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105889322:105889349:-1 gene:ENSG00000281984.1 transcript:ENST00000632619.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-21 description:immunoglobulin heavy diversity 2-21 [Source:HGNC Symbol;Acc:HGNC:5491]
+SILWW*LLF
+>ENSP00000488201.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105894313:105894332:-1 gene:ENSG00000282346.1 transcript:ENST00000631871.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-18 description:immunoglobulin heavy diversity 5-18 [Source:HGNC Symbol;Acc:HGNC:5509]
+VDTAMV
+>ENSP00000487787.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105896405:105896441:-1 gene:ENSG00000282232.1 transcript:ENST00000633379.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-16 description:immunoglobulin heavy diversity 3-16 [Source:HGNC Symbol;Acc:HGNC:5496]
+VL*LRLGELSLY
+>ENSP00000488261.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105895279:105895294:-1 gene:ENSG00000282274.1 transcript:ENST00000633010.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-17 description:immunoglobulin heavy diversity 4-17 [Source:HGNC Symbol;Acc:HGNC:5503]
+*LR*L
+>ENSP00000487789.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105892470:105892490:-1 gene:ENSG00000282487.1 transcript:ENST00000633159.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-19 description:immunoglobulin heavy diversity 6-19 [Source:HGNC Symbol;Acc:HGNC:5515]
+GYSSGWY
+>ENSP00000487812.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105891962:105891978:-1 gene:ENSG00000282592.1 transcript:ENST00000632968.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-20 description:immunoglobulin heavy diversity 1-20 [Source:HGNC Symbol;Acc:HGNC:5484]
+GITGT
+>ENSP00000487993.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105898728:105898758:-1 gene:ENSG00000282818.1 transcript:ENST00000632473.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-15 description:immunoglobulin heavy diversity 2-15 [Source:HGNC Symbol;Acc:HGNC:5489]
+RIL*WW*LLL
+>ENSP00000488522.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105901409:105901425:-1 gene:ENSG00000282736.1 transcript:ENST00000631884.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-14 description:immunoglobulin heavy diversity 1-14 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5483]
+GITGT
+>ENSP00000488592.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105901913:105901933:-1 gene:ENSG00000282042.1 transcript:ENST00000632859.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-13 description:immunoglobulin heavy diversity 6-13 [Source:HGNC Symbol;Acc:HGNC:5514]
+GYSSSWY
+>ENSP00000487922.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105903420:105903442:-1 gene:ENSG00000282102.1 transcript:ENST00000631895.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-12 description:immunoglobulin heavy diversity 5-12 [Source:HGNC Symbol;Acc:HGNC:5508]
+VDIVATI
+>ENSP00000488735.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105904387:105904402:-1 gene:ENSG00000281940.1 transcript:ENST00000634154.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-11 description:immunoglobulin heavy diversity 4-11 (non-functional) [Source:HGNC Symbol;Acc:HGNC:5502]
+*LQ*L
+>ENSP00000488475.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105905268:105905298:-1 gene:ENSG00000282373.1 transcript:ENST00000632609.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-10 description:immunoglobulin heavy diversity 3-10 [Source:HGNC Symbol;Acc:HGNC:5495]
+VLLWFGELL
+>ENSP00000487775.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105905452:105905482:-1 gene:ENSG00000281939.1 transcript:ENST00000632911.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-9 description:immunoglobulin heavy diversity 3-9 [Source:HGNC Symbol;Acc:HGNC:5499]
+VLRYFDWLL
+>ENSP00000488083.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105907982:105908012:-1 gene:ENSG00000282132.1 transcript:ENST00000633504.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-8 description:immunoglobulin heavy diversity 2-8 [Source:HGNC Symbol;Acc:HGNC:5492]
+RILY*WCMLY
+>ENSP00000488720.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105910678:105910694:-1 gene:ENSG00000282495.1 transcript:ENST00000632304.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-7 description:immunoglobulin heavy diversity 1-7 [Source:HGNC Symbol;Acc:HGNC:5486]
+GITGT
+>ENSP00000488589.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105911181:105911198:-1 gene:ENSG00000282010.1 transcript:ENST00000632542.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-6 description:immunoglobulin heavy diversity 6-6 [Source:HGNC Symbol;Acc:HGNC:5517]
+EYSSSS
+>ENSP00000487937.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105913028:105913047:-1 gene:ENSG00000282769.1 transcript:ENST00000633968.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-5 description:immunoglobulin heavy diversity 5-5 [Source:HGNC Symbol;Acc:HGNC:5511]
+VDTAMV
+>ENSP00000488889.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105913993:105914008:-1 gene:ENSG00000282227.1 transcript:ENST00000634085.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-4 description:immunoglobulin heavy diversity 4-4 [Source:HGNC Symbol;Acc:HGNC:5505]
+*LQ*L
+>ENSP00000487903.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105915130:105915160:-1 gene:ENSG00000282754.1 transcript:ENST00000633353.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-3 description:immunoglobulin heavy diversity 3-3 [Source:HGNC Symbol;Acc:HGNC:5498]
+VLRFLEWLLY
+>ENSP00000487604.1 pep chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105917597:105917627:-1 gene:ENSG00000282578.1 transcript:ENST00000631803.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-2 description:immunoglobulin heavy diversity 2-2 [Source:HGNC Symbol;Acc:HGNC:5490]
+RIL**YQLLC
+>ENSP00000419583.1 pep chromosome:GRCh38:14:105904681:105904711:-1 gene:ENSG00000211924.1 transcript:ENST00000390584.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-9 description:immunoglobulin heavy diversity 3-9 [Source:HGNC Symbol;Acc:HGNC:5499]
+VLRYFDWLL
+>ENSP00000428616.1 pep chromosome:GRCh38:14:105907211:105907241:-1 gene:ENSG00000211925.1 transcript:ENST00000390585.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-8 description:immunoglobulin heavy diversity 2-8 [Source:HGNC Symbol;Acc:HGNC:5492]
+RILY*WCMLY
+>ENSP00000420794.1 pep chromosome:GRCh38:14:105909907:105909923:-1 gene:ENSG00000237197.1 transcript:ENST00000430425.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-7 description:immunoglobulin heavy diversity 1-7 [Source:HGNC Symbol;Acc:HGNC:5486]
+GITGT
+>ENSP00000418151.1 pep chromosome:GRCh38:14:105910410:105910427:-1 gene:ENSG00000228131.1 transcript:ENST00000454691.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD6-6 description:immunoglobulin heavy diversity 6-6 [Source:HGNC Symbol;Acc:HGNC:5517]
+EYSSSS
+>ENSP00000417892.1 pep chromosome:GRCh38:14:105912257:105912276:-1 gene:ENSG00000211928.1 transcript:ENST00000390588.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5-5 description:immunoglobulin heavy diversity 5-5 [Source:HGNC Symbol;Acc:HGNC:5511]
+VDTAMV
+>ENSP00000428393.1 pep chromosome:GRCh38:14:105913222:105913237:-1 gene:ENSG00000233655.1 transcript:ENST00000414852.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-4 description:immunoglobulin heavy diversity 4-4 [Source:HGNC Symbol;Acc:HGNC:5505]
+*LQ*L
+>ENSP00000420442.1 pep chromosome:GRCh38:14:105914359:105914389:-1 gene:ENSG00000211930.1 transcript:ENST00000390590.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3-3 description:immunoglobulin heavy diversity 3-3 [Source:HGNC Symbol;Acc:HGNC:5498]
+VLRFLEWLLY
+>ENSP00000430788.1 pep chromosome:GRCh38:14:105916826:105916856:-1 gene:ENSG00000211931.1 transcript:ENST00000390591.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2-2 description:immunoglobulin heavy diversity 2-2 [Source:HGNC Symbol;Acc:HGNC:5490]
+RIL**YQLLC
+>ENSP00000418625.1 pep chromosome:GRCh38:14:105919502:105919518:-1 gene:ENSG00000236170.1 transcript:ENST00000454908.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1-1 description:immunoglobulin heavy diversity 1-1 [Source:HGNC Symbol;Acc:HGNC:5482]
+GTTGT
+>ENSP00000473849.1 pep chromosome:GRCh38:15:20003840:20003862:-1 gene:ENSG00000270961.1 transcript:ENST00000604642.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD5OR15-5A description:immunoglobulin heavy diversity 5/OR15-5A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5512]
+VDIVSTI
+>ENSP00000474693.2 pep chromosome:GRCh38:15:20004797:20004815:-1 gene:ENSG00000271317.1 transcript:ENST00000603326.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4OR15-4A description:immunoglobulin heavy diversity 4/OR15-4A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5506]
+*LWC*L
+>ENSP00000474133.2 pep chromosome:GRCh38:15:20005905:20005935:-1 gene:ENSG00000282520.1 transcript:ENST00000604950.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD3OR15-3A description:immunoglobulin heavy diversity 3/OR15-3A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5500]
+VL*FLDWLLY
+>ENSP00000474065.2 pep chromosome:GRCh38:15:20008402:20008432:-1 gene:ENSG00000282599.1 transcript:ENST00000603077.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD2OR15-2A description:immunoglobulin heavy diversity 2/OR15-2A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5493]
+RIL**YYFLC
+>ENSP00000473787.1 pep chromosome:GRCh38:15:20011153:20011169:-1 gene:ENSG00000271336.1 transcript:ENST00000605284.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD1OR15-1A description:immunoglobulin heavy diversity 1/OR15-1A (non-functional) [Source:HGNC Symbol;Acc:HGNC:5487]
+GITGT
+>ENSP00000487939.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142300947:142301455:1 gene:ENSG00000282568.1 transcript:ENST00000632828.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV2 description:T cell receptor beta variable 2 [Source:HGNC Symbol;Acc:HGNC:12195]
+MDTWLVCWAIFSLLKAGLTEPEVTQTPSHQVTQMGQEVILHCVPISNHLYFYWYRQILGQ
+KVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE
+>ENSP00000488814.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142308565:142309071:1 gene:ENSG00000282624.1 transcript:ENST00000632422.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV3-1 description:T cell receptor beta variable 3-1 [Source:HGNC Symbol;Acc:HGNC:12212]
+MGCRLLCCVVFCLLQAGPLDTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKK
+FLKIMFSYNNKELIINETVPNRFSPKSPDKAHLNLHINSLELGDSAVYFCASSQ
+>ENSP00000488131.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142313207:142313689:1 gene:ENSG00000282014.1 transcript:ENST00000632713.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-1 description:T cell receptor beta variable 4-1 [Source:HGNC Symbol;Acc:HGNC:12215]
+MGCRLLCCAVLCLLGAVPIDTEVTQTPKHLVMGMTNKKSLKCEQHMGHRAMYWYKQKAKK
+PPELMFVYSYEKLSINESVRSRFSPECPNSSLLNLHLHALQPEDSALYLCASSQ
+>ENSP00000488308.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142320696:142321563:1 gene:ENSG00000282803.1 transcript:ENST00000633384.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-1 description:T cell receptor beta variable 5-1 [Source:HGNC Symbol;Acc:HGNC:12218]
+MGSRLLCWVLLCLLGAGPVKAGVTQTPRYLIKTRGQQVTLSCSPISGHRSVSWYQQTPGQ
+GLQFLFEYFSETQRNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYLCASSL
+>ENSP00000488756.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142328321:142328810:1 gene:ENSG00000281970.1 transcript:ENST00000631557.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-1 description:T cell receptor beta variable 6-1 [Source:HGNC Symbol;Acc:HGNC:12226]
+MSIGLLCCVAFSLLWASPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHNSMYWYRQDPGM
+GLRLIYYSASEGTTDKGEVPNGYNVSRLNKREFSLRLESAAPSQTSVYFCASSE
+>ENSP00000488287.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142332206:142332727:1 gene:ENSG00000282225.1 transcript:ENST00000632308.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-1 description:T cell receptor beta variable 7-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12235]
+MGTRLLCWAAICLLGADHTGAGVSQSLRHKVAKKGKDVALRYDPISGHNALYWYRQSLGQ
+GLEFPIYFQGKDAADKSGLPRDRFSAQRSEGSISTLKFQRTQQGDLAVYLCASSS
+>ENSP00000487667.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142345452:142346016:1 gene:ENSG00000282285.1 transcript:ENST00000632512.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-2 description:T cell receptor beta variable 4-2 [Source:HGNC Symbol;Acc:HGNC:12216]
+MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKK
+PLELMFVYNFKEQTENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
+>ENSP00000488603.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142349215:142349695:1 gene:ENSG00000282719.1 transcript:ENST00000632016.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-2 description:T cell receptor beta variable 6-2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12227]
+MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGM
+GLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
+>ENSP00000488576.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142362570:142363134:1 gene:ENSG00000282543.1 transcript:ENST00000631427.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC234635.3 description:T cell receptor beta variable 4-3 [Source:UniProtKB/Swiss-Prot;Acc:A0A589]
+MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKK
+PLELMFVYSLEERVENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
+>ENSP00000488127.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142370836:142371348:1 gene:ENSG00000282353.1 transcript:ENST00000632148.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC234635.1 description:T cell receptor beta variable 6-3 [Source:UniProtKB/Swiss-Prot;Acc:P0DPF7]
+MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGM
+GLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
+>ENSP00000488152.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142374511:142375050:1 gene:ENSG00000282506.1 transcript:ENST00000631392.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-2 description:T cell receptor beta variable 7-2 [Source:HGNC Symbol;Acc:HGNC:12236]
+MGTRLLFWVAFCLLGADHTGAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQ
+GLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL
+>ENSP00000487798.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142402503:142402958:1 gene:ENSG00000282240.1 transcript:ENST00000633472.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-4 description:T cell receptor beta variable 6-4 [Source:HGNC Symbol;Acc:HGNC:12229]
+MRIRLLCCVAFSLLWAGPVIAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGL
+GLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD
+>ENSP00000488108.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142406045:142406551:1 gene:ENSG00000282203.1 transcript:ENST00000631882.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-3 description:T cell receptor beta variable 7-3 [Source:HGNC Symbol;Acc:HGNC:12237]
+MGTRLLCWAALCLLGADHTGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQ
+GPEFLIYFQGTGAADDSGLPNDRFFAVRPEGSVSTLKIQRTERGDSAVYLCASSL
+>ENSP00000488267.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142410913:142411379:1 gene:ENSG00000282148.1 transcript:ENST00000634123.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-3 description:T cell receptor beta variable 5-3 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12220]
+MGPGLLCWELLYLLGAGPVEAGVTQSPTHLIKTRGQQVTLRCSPISGHSSVSWYQQAPGQ
+GPQFIFEYANELRRSEGNFPNRFSGRQFHDCCSEMNVSALELGDSALYLCARSL
+>ENSP00000488515.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142413602:142414123:1 gene:ENSG00000282204.1 transcript:ENST00000633328.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV9 description:T cell receptor beta variable 9 [Source:HGNC Symbol;Acc:HGNC:12246]
+MGFRLLCCVAFCLLGAGPVDSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQ
+GLQFLIQYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV
+>ENSP00000488035.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142421573:142422090:1 gene:ENSG00000282618.1 transcript:ENST00000632248.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-1 description:T cell receptor beta variable 10-1(gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12177]
+MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGH
+GLRLIHYSYGVHDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE
+>ENSP00000488521.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142429379:142429843:1 gene:ENSG00000282711.1 transcript:ENST00000634176.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-1 description:T cell receptor beta variable 11-1 [Source:HGNC Symbol;Acc:HGNC:12180]
+MSTRLLCWMALCLLGAELSEAEVAQSPRYKITEKSQAVAFWCDPISGHATLYWYRQILGQ
+GPELLVQFQDESVVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAMYLCASSL
+>ENSP00000488043.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142446652:142447152:1 gene:ENSG00000282007.1 transcript:ENST00000633575.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-2 description:T cell receptor beta variable 10-2 [Source:HGNC Symbol;Acc:HGNC:12178]
+MGTRLFFYVALCLLWAGHRDAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGH
+GLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE
+>ENSP00000488123.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142472647:142473148:1 gene:ENSG00000277110.3 transcript:ENST00000633072.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-5 description:T cell receptor beta variable 6-5 [Source:HGNC Symbol;Acc:HGNC:12230]
+MSIGLLCCAALSLLWAGPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGM
+GLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSY
+>ENSP00000488823.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142476873:142477334:1 gene:ENSG00000282756.1 transcript:ENST00000633313.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-4 description:T cell receptor beta variable 7-4 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12238]
+MGTRLLCWVVLGFLGTDHTGAGVSQSPRYKVAKRGRDVALRCDSISGHVTLYWYRQTLGQ
+GSEVLTYSQSDAQRDKSGRPSGRFSAERPERSVSTLKIQCTEQGDSAVYLCASSL
+>ENSP00000488374.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142484618:142485283:1 gene:ENSG00000282466.1 transcript:ENST00000633696.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-4 description:T cell receptor beta variable 5-4 [Source:HGNC Symbol;Acc:HGNC:12221]
+MGPGLLCWALLCLLGAGSVDAGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQ
+GPQFIFQYYREEENGRGNFPPRFSGLQFPNDSSELNVNALELDDSALYLCASSL
+>ENSP00000488741.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142491254:142491732:1 gene:ENSG00000282459.1 transcript:ENST00000633963.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-6 description:T cell receptor beta variable 6-6 [Source:HGNC Symbol;Acc:HGNC:12231]
+MSISLLCCAAFPLLWAGPVNAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGM
+GLKLIYYSVGAGITDKGEVPNGYNVSRSTTEYFPLRLELAAPSQTSVYFCASSY
+>ENSP00000488241.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142504264:142504735:1 gene:ENSG00000282577.1 transcript:ENST00000632187.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-5 description:T cell receptor beta variable 5-5 [Source:HGNC Symbol;Acc:HGNC:12222]
+MGPGLLCWVLLCLLVAGPVDAGVTQSPTHLIKTRGQQVTLRCSPISGHKSVSWYQQVLGQ
+GPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASSL
+>ENSP00000488335.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142509537:142510011:1 gene:ENSG00000282470.1 transcript:ENST00000631511.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-7 description:T cell receptor beta variable 6-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12232]
+MSLGLLCCVAFSLLWAGPMNAGVTQTPKFHVLKTGQSMTLLCAQDMNHEYMYRYRQDPGK
+GLRLIYYSVAAALTDKGEVPNGYNVSRSNTEDFPLKLESAAPSQTSVYFCASSY
+>ENSP00000488212.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142513848:142514385:1 gene:ENSG00000282704.1 transcript:ENST00000633265.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-6 description:T cell receptor beta variable 7-6 [Source:HGNC Symbol;Acc:HGNC:12240]
+MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQ
+GPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSVSTLTIQRTEQRDSAMYRCASSL
+>ENSP00000487850.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142521745:142522251:1 gene:ENSG00000282098.1 transcript:ENST00000632216.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-6 description:T cell receptor beta variable 5-6 [Source:HGNC Symbol;Acc:HGNC:12223]
+MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKSGHDTVSWYQQALGQ
+GPQFIFQYYEEEERQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYLCASSL
+>ENSP00000488870.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142529032:142529526:1 gene:ENSG00000282134.1 transcript:ENST00000632425.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-8 description:T cell receptor beta variable 6-8 [Source:HGNC Symbol;Acc:HGNC:12233]
+MSLGLLCCAAFSLLWAGPVNAGVTQTPKFHILKTGQSMTLQCAQDMNHGYMSWYRQDPGM
+GLRLIYYSAAAGTTDKEVPNGYNVSRLNTEDFPLRLVSAAPSRTSVYLCASSY
+>ENSP00000488424.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142533342:142533843:1 gene:ENSG00000282179.1 transcript:ENST00000631548.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-7 description:T cell receptor beta variable 7-7 [Source:HGNC Symbol;Acc:HGNC:12241]
+MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVTLRCDPISSHATLYWYQQALGQ
+GPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
+>ENSP00000488478.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142541804:142542270:1 gene:ENSG00000282748.1 transcript:ENST00000633790.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-7 description:T cell receptor beta variable 5-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12224]
+MGPGLLCWVLLCPLGEGPVDAGVTQSPTHLIKTRGQHVTLRCSPISGHTSVSSYQQALGQ
+GPQFIFQYYEKEERGRGNFPDQFSGHQFPNYSSELNVNALLLGDSALYLCASSL
+>ENSP00000488280.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142549110:142549542:1 gene:ENSG00000282610.1 transcript:ENST00000634093.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC233282.2 description:T cell receptor beta variable 6-9 [Source:UniProtKB/Swiss-Prot;Acc:A0A0J9YX75]
+MSIGLLCCVAFSLLWAGPVNAGVTQTPKFHILKTGQSMTLQCAQDMNHGYLSWYRQDPGM
+GLRRIHYSVAAGITDKGEVPDGYNVSRSNTEDFPLRLESAAPSQTSVYFCASSY
+>ENSP00000488190.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142553725:142554208:1 gene:ENSG00000282040.1 transcript:ENST00000632560.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC233282.1 description:T cell receptor beta variable 7-8 [Source:UniProtKB/Swiss-Prot;Acc:A0A1B0GX51]
+MGTRLLCWVVLGFLGTDHTGAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQ
+GPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQQEDSAVYLCASSL
+>ENSP00000488017.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142561449:142562408:1 gene:ENSG00000282054.1 transcript:ENST00000631639.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:AC229888.1 description:T cell receptor beta variable 5-8 [Source:UniProtKB/Swiss-Prot;Acc:A0A5A2]
+MGPRLLFWALLCLLGTGPVEAGVTQSPTHLIKTRGQQATLRCSPISGHTSVYWYQQALGL
+GLQFLLWYDEGEERNRGNFPPRFSGRQFPNYSSELNVNALELEDSALYLCASSL
+>ENSP00000487884.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142571143:142571615:1 gene:ENSG00000281943.1 transcript:ENST00000632021.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-9 description:T cell receptor beta variable 7-9 [Source:HGNC Symbol;Acc:HGNC:12243]
+MGTSLLCWMALCLLGADHADTGVSQDPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQ
+GPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
+>ENSP00000488778.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142577660:142578143:1 gene:ENSG00000282407.1 transcript:ENST00000633796.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV13 description:T cell receptor beta variable 13 [Source:HGNC Symbol;Acc:HGNC:12188]
+MLSPDLPDSAWNTRLLCRVMLCLLGAGSVAAGVIQSPRHLIKEKRETATLKCYPIPRHDT
+VYWYQQGPGQDPQFLISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFC
+ASSL
+>ENSP00000487891.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142586067:142586516:1 gene:ENSG00000282340.1 transcript:ENST00000631471.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-3 description:T cell receptor beta variable 10-3 [Source:HGNC Symbol;Acc:HGNC:12179]
+MGTRLFFYVALCLLWTGHMDAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGH
+GLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE
+>ENSP00000487749.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142596665:142597147:1 gene:ENSG00000282242.1 transcript:ENST00000634111.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-3 description:T cell receptor beta variable 11-3 [Source:HGNC Symbol;Acc:HGNC:12182]
+MGTRLLCWVAFCLLVEELIEAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYLQNLGQ
+GPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL
+>ENSP00000487964.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142602235:142602743:1 gene:ENSG00000282208.1 transcript:ENST00000633292.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-3 description:T cell receptor beta variable 12-3 [Source:HGNC Symbol;Acc:HGNC:12185]
+MDSWTFCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMR
+GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
+>ENSP00000488855.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142605549:142606054:1 gene:ENSG00000282354.1 transcript:ENST00000631824.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-4 description:T cell receptor beta variable 12-4 [Source:HGNC Symbol;Acc:HGNC:12186]
+MDSWTLCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMR
+GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
+>ENSP00000488633.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142622755:142623265:1 gene:ENSG00000282605.1 transcript:ENST00000632829.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-5 description:T cell receptor beta variable 12-5 [Source:HGNC Symbol;Acc:HGNC:12187]
+MATRLLCCVVLCLLGEELIDARVTQTPRDKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQ
+GLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGL
+>ENSP00000488641.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142629704:142630195:1 gene:ENSG00000282252.1 transcript:ENST00000632432.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV14 description:T cell receptor beta variable 14 [Source:HGNC Symbol;Acc:HGNC:12189]
+MVSRLLSLVSLCLLGAKHIEAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGK
+EIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQ
+>ENSP00000488551.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142634764:142635309:1 gene:ENSG00000282497.1 transcript:ENST00000631835.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV15 description:T cell receptor beta variable 15 [Source:HGNC Symbol;Acc:HGNC:12190]
+MGPGLLHWMALCLLGTGHGDAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQ
+APKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYLCATSR
+>ENSP00000487913.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142639852:142640305:1 gene:ENSG00000282415.1 transcript:ENST00000633244.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV16 description:T cell receptor beta variable 16 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12191]
+MSPIFTCITILCLLAAGSPGEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKN
+EFKFLISFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASSQ
+>ENSP00000488775.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142643462:142644194:1 gene:ENSG00000282483.1 transcript:ENST00000631663.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV17 description:T cell receptor beta variable 17 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12192]
+MDIWLLCWVTLCLLAAGHSEPGVSQTPRHKVTNMGQEVILRCDPSSGHMFVHWYRQNLRQ
+EMKLLISFQYQNIAVDSGMPKERFTAERPNGTSSTLKIHPAEPRDSAVYLYSSG
+>ENSP00000488621.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142657499:142658198:1 gene:ENSG00000282771.1 transcript:ENST00000631559.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV18 description:T cell receptor beta variable 18 [Source:HGNC Symbol;Acc:HGNC:12193]
+MDTRVLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEE
+GLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSP
+>ENSP00000487807.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142660632:142661315:1 gene:ENSG00000282621.1 transcript:ENST00000632638.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV19 description:T cell receptor beta variable 19 [Source:HGNC Symbol;Acc:HGNC:12194]
+MSNQVLCCVVLCFLGANTVDGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQ
+GLRLIYYSQIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI
+>ENSP00000488099.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142668431:142669181:1 gene:ENSG00000282064.1 transcript:ENST00000633466.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV20-1 description:T cell receptor beta variable 20-1 [Source:HGNC Symbol;Acc:HGNC:12196]
+MLLLLLLLGPGSGLGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML
+MATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
+>ENSP00000487718.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142687733:142688229:1 gene:ENSG00000282449.1 transcript:ENST00000633842.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV23-1 description:T cell receptor beta variable 23-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12201]
+MGTRLLGCAALCLLAADSFHAKVTQTPGHLVKGKGQKTKMDCTPEKGHTFVYWYQQNQNK
+EFMLLISFQNEQVLQETEMHKKRFSSQCPKNAPCSLAILSSEPGDTALYLCASSQ
+>ENSP00000488057.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142697038:142697550:1 gene:ENSG00000282730.1 transcript:ENST00000633092.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV24-1 description:T cell receptor beta variable 24-1 [Source:HGNC Symbol;Acc:HGNC:12203]
+MASLLFFCGAFYLLGTGSMDADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGL
+GLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
+>ENSP00000479511.2 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142711413:142711917:1 gene:ENSG00000281963.1 transcript:ENST00000610439.4 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV25-1 description:T cell receptor beta variable 25-1 [Source:HGNC Symbol;Acc:HGNC:12205]
+MTIRLLCYMGFYFLGAGLMEADIYQTPRYLVIGTGKKITLECSQTMGHDKMYWYQQDPGM
+ELHLIHYSYGVNSTEKGDLSSESTVSRIRTEHFPLTLESARPSHTSQYLCASSE
+>ENSP00000488274.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142756048:142756563:1 gene:ENSG00000282234.1 transcript:ENST00000633283.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV27 description:T cell receptor beta variable 27 [Source:HGNC Symbol;Acc:HGNC:12208]
+MGPQLLGYVVLCLLGAGPLEAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGL
+GLRQIYYSMNVEVTDKGDVPEGYKVSRKEKRNFPLILESPSPNQTSLYFCASSL
+>ENSP00000480928.2 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142761362:142761862:1 gene:ENSG00000282812.1 transcript:ENST00000619125.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV28 description:T cell receptor beta variable 28 [Source:HGNC Symbol;Acc:HGNC:12209]
+MGIRLLCRVAFCFLAVGLVDVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGL
+GLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASSL
+>ENSP00000488861.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142780919:142781607:1 gene:ENSG00000282628.1 transcript:ENST00000634198.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV29-1 description:T cell receptor beta variable 29-1 [Source:HGNC Symbol;Acc:HGNC:12210]
+MLSLLLLLLGLGSVFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTL
+IATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVE
+>ENSP00000487814.1 pep chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142873679:142874492:-1 gene:ENSG00000282297.1 transcript:ENST00000631690.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV30 description:T cell receptor beta variable 30 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12214]
+MLCSLLALLLGTFFGVRSQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRG
+LQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS
+>ENSP00000368747.3 pep chromosome:GRCh38:9:33617762:33618506:1 gene:ENSG00000205274.3 transcript:ENST00000379435.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV20OR9-2 description:T cell receptor beta variable 20/OR9-2 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12197]
+METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVICKSGTSVNIECRSLD
+FQATTMFWYRQLRKQSLMLMATSNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPE
+DSSFYICSAR
+>ENSP00000374867.2 pep chromosome:GRCh38:7:38349355:38350022:-1 gene:ENSG00000211697.4 transcript:ENST00000390344.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV5 description:T cell receptor gamma variable 5 [Source:HGNC Symbol;Acc:HGNC:12290]
+MRWALLVLLAFLSPASQKSSNLEGGTKSVTRPTRSSAEITCDLTVINAFYIHWYLHQEGK
+APQRLLYYDVSNSKDVLESGLSPGKYYTHTPRRWSWILILRNLIENDSGVYYCATWDR
+>ENSP00000404928.2 pep chromosome:GRCh38:7:38362864:38363518:-1 gene:ENSG00000233306.2 transcript:ENST00000426402.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV2 description:T cell receptor gamma variable 2 [Source:HGNC Symbol;Acc:HGNC:12287]
+MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSNGYIHWYLHQEGK
+APQRLQYYDSYNSKVVLESGVSPGKYYTYASTRNNLRLILRNLIENDFGVYYCATWDG
+>ENSP00000374864.2 pep chromosome:GRCh38:7:38299811:38300322:-1 gene:ENSG00000211694.2 transcript:ENST00000390341.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV10 description:T cell receptor gamma variable 10 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12285]
+MFIGNSPLLLTVGLGLSKVEQFQLSISTEVKKSIDIPCKISSTRFETDVIHWYRQKPNQA
+LEHLIYIVSTKSAARRSMGKTSNKVEARKNSQTLTSILTIKSVEKEDMAVYYCAAWD
+>ENSP00000374866.2 pep chromosome:GRCh38:7:38330343:38330935:-1 gene:ENSG00000211696.2 transcript:ENST00000390343.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV8 description:T cell receptor gamma variable 8 [Source:HGNC Symbol;Acc:HGNC:12294]
+MLLALALLLAFLPPASQKSSNLEGRTKSVTRPTGSSAVITCDLPVENAVYTHWYLHQEGK
+APQRLLYYDSYNSRVVLESGISREKYHTYASTGKSLKFILENLIERDSGVYYCATWDR
+>ENSP00000374869.2 pep chromosome:GRCh38:7:38358512:38359162:-1 gene:ENSG00000211699.2 transcript:ENST00000390346.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV3 description:T cell receptor gamma variable 3 [Source:HGNC Symbol;Acc:HGNC:12288]
+MRWALLVLLAFLSPASQKSSNLEGRTKSVTRQTGSSAEITCDLTVTNTFYIHWYLHQEGK
+APQRLLYYDVSTARDVLESGLSPGKYYTHTPRRWSWILRLQNLIENDSGVYYCATWDR
+>ENSP00000391561.2 pep chromosome:GRCh38:7:38317017:38318861:-1 gene:ENSG00000211695.2 transcript:ENST00000444775.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV9 description:T cell receptor gamma variable 9 [Source:HGNC Symbol;Acc:HGNC:12295]
+MLSLLHTSTLAVLGALCVYGAGHLEQPQISSTKTLSKTARLECVVSGITISATSVYWYRE
+RPGEVIQFLVSISYDGTVRKESGIPSGKFEVDRIPETSTSTLTIHNVEKQDIATYYCALW
+EV
+>ENSP00000374868.2 pep chromosome:GRCh38:7:38353715:38354517:-1 gene:ENSG00000211698.2 transcript:ENST00000390345.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV4 description:T cell receptor gamma variable 4 [Source:HGNC Symbol;Acc:HGNC:12289]
+MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSTGYIHWYLHQEGK
+APQRLLYYDSYTSSVVLESGISPGKYDTYGSTRKNLRMILRNLIENDSGVYYCATWDG
+>ENSP00000374871.2 pep chromosome:GRCh38:7:38367586:38368169:-1 gene:ENSG00000211701.2 transcript:ENST00000390348.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV1 description:T cell receptor gamma variable 1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12284]
+MRWALAVLLAFLSPASQISSNLEGRTKSVTRLTGSSAEITCDLPGASTLYIHWYLHQEGK
+APQCLLYYEPYYSRVVLESGITPGKYDTGSTRSNWNLRLQNLIKNDSGFYYCATWDR
+>ENSP00000374863.2 pep chromosome:GRCh38:7:38291616:38292078:-1 gene:ENSG00000211693.2 transcript:ENST00000390340.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRGV11 description:T cell receptor gamma variable 11 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12286]
+LGQLEQPEISISRPANKSAHISWKASIQGFSSKIIHWYWQKPNKGLEYLLHVFLTISAQD
+CSGGKTKKLEVSKNAHTSTSTLKIKFLEKEDEVVYHCACWIRH
+>ENSP00000446309.1 pep chromosome:GRCh38:14:21621838:21622567:1 gene:ENSG00000255569.1 transcript:ENST00000542354.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV1-1 description:T cell receptor alpha variable 1-1 [Source:HGNC Symbol;Acc:HGNC:12101]
+MWGAFLLYVSMKMGGTAGQSLEQPSEVTAVEGAIVQINCTYQTSGFYGLSWYQQHDGGAP
+TFLSYNALDGLEETGRFSSFLSRSDSYGYLLLQELQMKDSASYFCAVR
+>ENSP00000439668.1 pep chromosome:GRCh38:14:21642889:21643578:1 gene:ENSG00000256553.1 transcript:ENST00000390423.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV1-2 description:T cell receptor alpha variable 1-2 [Source:HGNC Symbol;Acc:HGNC:12102]
+MWGVFLLYVSMKMGGTTGQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAP
+TFLSYNVLDGLEEKGRFSSFLSRSKGYSYLLLKELQMKDSASYLCA
+>ENSP00000438195.1 pep chromosome:GRCh38:14:21712321:21712843:1 gene:ENSG00000211776.2 transcript:ENST00000390424.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV2 description:T cell receptor alpha variable 2 [Source:HGNC Symbol;Acc:HGNC:12116]
+MALQSTLGAVWLGLLLNSLWKVAESKDQVFQPSTVASSEGAVVEIFCNHSVSNAYNFFWY
+LHFPGCAPRLLVKGSKPSQQGRYNMTYERFSSSLLILQVREADAAVYYCAVE
+>ENSP00000444955.1 pep chromosome:GRCh38:14:21723713:21724321:1 gene:ENSG00000211777.2 transcript:ENST00000390425.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV3 description:T cell receptor alpha variable 3 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12128]
+MASAPISMLAMLFTLSGLRAQSVAQPEDQVNVAEGNPLTVKCTYSVSGNPYLFWYVQYPN
+RGLQFLLKYITGDNLVKGSYGFEAEFNKSQTSFHLKKPSALVSDSALYFCAVRD
+>ENSP00000451535.1 pep chromosome:GRCh38:14:21736152:21736982:1 gene:ENSG00000211778.2 transcript:ENST00000390426.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV4 description:T cell receptor alpha variable 4 [Source:HGNC Symbol;Acc:HGNC:12140]
+MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIATNDYITWYQQFPSQG
+PRFIIQGYKTKVTNEVASLFIPADRKSSTLSLPRVSLSDTAVYYCLVGD
+>ENSP00000446355.1 pep chromosome:GRCh38:14:21749178:21749705:1 gene:ENSG00000211779.3 transcript:ENST00000390427.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV5 description:T cell receptor alpha variable 5 [Source:HGNC Symbol;Acc:HGNC:12143]
+MKTFAGFSFLFLWLQLDCMSRGEDVEQSLFLSVREGDSSVINCTYTDSSSTYLYWYKQEP
+GAGLQLLTYIFSNMDMKQDQRLTVLLNKKDKHLSLRIADTQTGDSAIYFCAES
+>ENSP00000438290.1 pep chromosome:GRCh38:14:21768489:21769080:1 gene:ENSG00000211780.3 transcript:ENST00000390428.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV6 description:T cell receptor alpha variable 6 [Source:HGNC Symbol;Acc:HGNC:12144]
+MAFWLRSLGLHFRPHLGRRMESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATL
+TCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQ
+PADSATYLCALD
+>ENSP00000443297.1 pep chromosome:GRCh38:14:21782993:21783503:1 gene:ENSG00000211781.3 transcript:ENST00000390429.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV7 description:T cell receptor alpha variable 7 [Source:HGNC Symbol;Acc:HGNC:12145]
+MEKMRRPVLIIFCLCLGWANGENQVEHSPHFLGPQQGDVASMSCTYSVSRFNNLQWYRQN
+TGMGPKHLLSMYSAGYEKQKGRLNATLLKNGSSLYITAVQPEDSATYFCAVD
+>ENSP00000443059.1 pep chromosome:GRCh38:14:21797287:21797886:1 gene:ENSG00000211782.2 transcript:ENST00000390430.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-1 description:T cell receptor alpha variable 8-1 [Source:HGNC Symbol;Acc:HGNC:12146]
+MLLLLIPVLGMIFALRDARAQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPG
+QHLQLLLKYFSGDPLVKGIKGFEAEFIKSKFSFNLRKPSVQWSDTAEYFCAVN
+>ENSP00000438446.1 pep chromosome:GRCh38:14:21811502:21811977:1 gene:ENSG00000211783.3 transcript:ENST00000390431.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV9-1 description:T cell receptor alpha variable 9-1 [Source:HGNC Symbol;Acc:HGNC:12153]
+MNSSPGPAIALFLMFGGINGDSVVQTEGQVLPSEGDSLIVNCSYETTQYPSLFWYVQYPG
+EGPQLHLKAMKANDKGRNKGFEAMYRKETTSFHLEKDSVQESDSAVYFCALS
+>ENSP00000440313.1 pep chromosome:GRCh38:14:21825472:21826075:1 gene:ENSG00000211784.2 transcript:ENST00000390432.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV10 description:T cell receptor alpha variable 10 [Source:HGNC Symbol;Acc:HGNC:12103]
+MKKHLTTFLVILWLYFYRGNGKNQVEQSPQSLIILEGKNCTLQCNYTVSPFSNLRWYKQD
+TGRGPVSLTIMTFSENTKSNGRYTATLDADTKQSSLHITASQLSDSASYICVVS
+>ENSP00000445405.1 pep chromosome:GRCh38:14:21841240:21841774:1 gene:ENSG00000211785.1 transcript:ENST00000390433.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV12-1 description:T cell receptor alpha variable 12-1 [Source:HGNC Symbol;Acc:HGNC:12105]
+MISLRVLLVILWLQLSWVWSQRKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQD
+CRKEPKLLMSVYSSGNEDGRFTAQLNRASQYISLLIRDSKLSDSATYLCVVN
+>ENSP00000439323.1 pep chromosome:GRCh38:14:21846537:21847221:1 gene:ENSG00000211786.3 transcript:ENST00000390434.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-2 description:T cell receptor alpha variable 8-2 [Source:HGNC Symbol;Acc:HGNC:12147]
+MLLLLVPVLEVIFTLGGTRAQSVTQLDSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPN
+KGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVVS
+>ENSP00000440087.1 pep chromosome:GRCh38:14:21852558:21853006:1 gene:ENSG00000211787.1 transcript:ENST00000390435.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-3 description:T cell receptor alpha variable 8-3 [Source:HGNC Symbol;Acc:HGNC:12148]
+MLLELIPLLGIHFVLRTARAQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPG
+QGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVG
+>ENSP00000441696.1 pep chromosome:GRCh38:14:21868839:21869365:1 gene:ENSG00000211788.2 transcript:ENST00000390436.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV13-1 description:T cell receptor alpha variable 13-1 [Source:HGNC Symbol;Acc:HGNC:12108]
+MTSIRAVFIFLWLQLDLVNGENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELG
+KGPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLHITETQPEDSAVYFCAAS
+>ENSP00000437362.1 pep chromosome:GRCh38:14:21887857:21888502:1 gene:ENSG00000211789.2 transcript:ENST00000390437.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV12-2 description:T cell receptor alpha variable 12-2 [Source:HGNC Symbol;Acc:HGNC:12106]
+MKSLRVLLVILWLQLSWVWSQQKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQY
+SGKSPELIMFIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVN
+>ENSP00000445942.1 pep chromosome:GRCh38:14:21894433:21895030:1 gene:ENSG00000211790.2 transcript:ENST00000390438.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-4 description:T cell receptor alpha variable 8-4 [Source:HGNC Symbol;Acc:HGNC:12149]
+MLLLLVPVLEVIFTLGGTRAQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPN
+QGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
+>ENSP00000438480.1 pep chromosome:GRCh38:14:21918188:21918756:1 gene:ENSG00000211791.2 transcript:ENST00000390439.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV13-2 description:T cell receptor alpha variable 13-2 [Source:HGNC Symbol;Acc:HGNC:12109]
+MAGIRALFMYLWLQLDWVSRGESVGLHLPTLSVQEGDNSIINCAYSNSASDYFIWYKQES
+GKGPQFIIDIRSNMDKRQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYFCAEN
+>ENSP00000446015.1 pep chromosome:GRCh38:14:21924063:21924651:1 gene:ENSG00000211792.2 transcript:ENST00000390440.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV14DV4 description:T cell receptor alpha variable 14/delta variable 4 [Source:HGNC Symbol;Acc:HGNC:12110]
+MSLSSLLKVVTASLWLGPGIAQKITQTQPGMFVQEKEAVTLDCTYDTSDPSYGLFWYKQP
+SSGEMIFLIYQGSYDQQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAMRE
+>ENSP00000452011.1 pep chromosome:GRCh38:14:21941128:21941657:1 gene:ENSG00000211793.2 transcript:ENST00000390441.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV9-2 description:T cell receptor alpha variable 9-2 [Source:HGNC Symbol;Acc:HGNC:12154]
+MNYSPGLVSLILLLLGRTRGDSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPG
+EGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCALS
+>ENSP00000451822.1 pep chromosome:GRCh38:14:21965451:21966061:1 gene:ENSG00000211794.3 transcript:ENST00000390442.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV12-3 description:T cell receptor alpha variable 12-3 [Source:HGNC Symbol;Acc:HGNC:12107]
+MMKSLRVLLVILWLQLSWVWSQQKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQ
+YSRKGPELLMYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMS
+>ENSP00000450505.1 pep chromosome:GRCh38:14:21978459:21979120:1 gene:ENSG00000211795.3 transcript:ENST00000390443.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-6 description:T cell receptor alpha variable 8-6 [Source:HGNC Symbol;Acc:HGNC:12151]
+MLLLLVPAFQVIFTLGGTRAQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPN
+QGLQLLLKYLSGSTLVESINGFEAEFNKSQTSFHLRKPSVHISDTAEYFCAVS
+>ENSP00000451359.1 pep chromosome:GRCh38:14:21990496:21990938:1 gene:ENSG00000211796.1 transcript:ENST00000390444.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV16 description:T cell receptor alpha variable 16 [Source:HGNC Symbol;Acc:HGNC:12112]
+MKPTLISVLVIIFILRGTRAQRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSR
+QRLQLLLRHISRESIKGFTADLNKGETSFHLKKPFAQEEDSAMYYCALS
+>ENSP00000452087.1 pep chromosome:GRCh38:14:21997539:21998168:1 gene:ENSG00000211797.2 transcript:ENST00000390445.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV17 description:T cell receptor alpha variable 17 [Source:HGNC Symbol;Acc:HGNC:12113]
+METLLGVSLVILWLQLARVNSQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSG
+RGLVHLILIRSNEREKHSGRLRVTLDTSKKSSSLLITASRAADTASYFCATD
+>ENSP00000451574.1 pep chromosome:GRCh38:14:22003106:22003673:1 gene:ENSG00000211798.3 transcript:ENST00000390446.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV18 description:T cell receptor alpha variable 18 [Source:HGNC Symbol;Acc:HGNC:12114]
+MLSASCSGLVILLIFRRTSGDSVTQTEGPVTLPERAALTLNCTYQSSYSTFLFWYVQYLN
+KEPELLLKSSENQETDSRGFQASPIKSDSSFHLEKPSVQLSDSAVYYCALR
+>ENSP00000452148.1 pep chromosome:GRCh38:14:22007512:22008181:1 gene:ENSG00000211799.3 transcript:ENST00000390447.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV19 description:T cell receptor alpha variable 19 [Source:HGNC Symbol;Acc:HGNC:12115]
+MLTASLLRAVIASICVVSSMAQKVTQAQTEISVVEKEDVTLDCVYETRDTTYYLFWYKQP
+PSGELVFLIRRNSFDEQNEISGRYSWNFQKSTSSFNFTITASQVVDSAVYFCALSE
+>ENSP00000452067.1 pep chromosome:GRCh38:14:22040594:22041153:1 gene:ENSG00000211800.3 transcript:ENST00000390448.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV20 description:T cell receptor alpha variable 20 [Source:HGNC Symbol;Acc:HGNC:12117]
+MEKMLECAFIVLWLQLGWLSGEDQVTQSPEALRLQEGESSSLNCSYTVSGLRGLFWYRQD
+PGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCAVQ
+>ENSP00000452526.1 pep chromosome:GRCh38:14:22052514:22053056:1 gene:ENSG00000211801.3 transcript:ENST00000390449.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV21 description:T cell receptor alpha variable 21 [Source:HGNC Symbol;Acc:HGNC:12118]
+METLLGLLILWLQLQWVSSKQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPG
+KGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCAVR
+>ENSP00000452420.1 pep chromosome:GRCh38:14:22070557:22071208:1 gene:ENSG00000211802.3 transcript:ENST00000390450.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV22 description:T cell receptor alpha variable 22 [Source:HGNC Symbol;Acc:HGNC:12119]
+MKRILGALLGLLSAQVCCVRGIQVEQSPPDLILQEGANSTLRCNFSDSVNNLQWFHQNPW
+GQLINLFYIPSGTKQNGRLSATTVATERYSLLYISSSQTTDSGVYFCAVE
+>ENSP00000451203.1 pep chromosome:GRCh38:14:22086407:22086961:1 gene:ENSG00000211803.2 transcript:ENST00000390451.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV23DV6 description:T cell receptor alpha variable 23/delta variable 6 [Source:HGNC Symbol;Acc:HGNC:12120]
+MDKILGASFLVLWLQLCWVSGQQKEKSDQQQVKQSPQSLIVQKGGISIINCAYENTAFDY
+FPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSSHIMDSQPGDSATYFCAA
+S
+>ENSP00000452111.1 pep chromosome:GRCh38:14:22096032:22096619:1 gene:ENSG00000211804.3 transcript:ENST00000390452.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRDV1 description:T cell receptor delta variable 1 [Source:HGNC Symbol;Acc:HGNC:12262]
+MLFSSLLCVFVAFSYSGSSVAQKVTQAQSSVSMPVRKAVTLNCLYETSWWSYYIFWYKQL
+PSKEMIFLIRQGSDEQNAKSGRYSVNFKKAAKSVALTISALQLEDSAKYFCALGE
+>ENSP00000484940.1 pep chromosome:GRCh38:14:22096507:22096608:1 gene:ENSG00000211804.3 transcript:ENST00000621643.1 gene_biotype:TR_V_gene transcript_biotype:protein_coding gene_symbol:TRDV1 description:T cell receptor delta variable 1 [Source:HGNC Symbol;Acc:HGNC:12262]
+KSGRYSVNFKKAAKSVALTISALQLEDSAKYFCA
+>ENSP00000451837.1 pep chromosome:GRCh38:14:22105343:22105846:1 gene:ENSG00000211805.1 transcript:ENST00000390453.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV24 description:T cell receptor alpha variable 24 [Source:HGNC Symbol;Acc:HGNC:12121]
+MEKNPLAAPLLILWFHLDCVSSILNVEQSPQSLHVQEGDSTNFTCSFPSSNFYALHWYRW
+ETAKSPEALFVMTLNGDEKKKGRISATLNTKEGYSYLYIKGSQPEDSATYLCAF
+>ENSP00000452100.1 pep chromosome:GRCh38:14:22112347:22113031:1 gene:ENSG00000211806.2 transcript:ENST00000390454.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV25 description:T cell receptor alpha variable 25 [Source:HGNC Symbol;Acc:HGNC:12122]
+MLLITSMLVLWMQLSQVNGQQVMQIPQYQHVQEGEDFTTYCNSSTTLSNIQWYKQRPGGH
+PVFLIQLVKSGEVKKQKRLTFQFGEAKKNSSLHITATQTTDVGTYFCAG
+>ENSP00000452431.1 pep chromosome:GRCh38:14:22123318:22124285:1 gene:ENSG00000211807.3 transcript:ENST00000390455.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV26-1 description:T cell receptor alpha variable 26-1 [Source:HGNC Symbol;Acc:HGNC:12123]
+MRLVARVTVFLTFGTIIDAKTTQPTSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQG
+PQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVRV
+>ENSP00000450448.1 pep chromosome:GRCh38:14:22132553:22133034:1 gene:ENSG00000211808.3 transcript:ENST00000390456.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV8-7 description:T cell receptor alpha variable 8-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12152]
+MLLVVILLLGMFFTLSKTQSVTQLDGHITVSEEAPLELKCNYSYSGVPSLFWYVQYSSQS
+LQLLLKDLTKATQVKGIRGFEAEFKKSETSFYLRKPSTHVSDAAEYFCAVGDR
+>ENSP00000451735.1 pep chromosome:GRCh38:14:22147995:22148633:1 gene:ENSG00000211809.2 transcript:ENST00000390457.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV27 description:T cell receptor alpha variable 27 [Source:HGNC Symbol;Acc:HGNC:12125]
+MVLKFSVSILWIQLAWVSTQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEG
+PVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQTGDTGLYLCAG
+>ENSP00000452209.1 pep chromosome:GRCh38:14:22163238:22163870:1 gene:ENSG00000211810.3 transcript:ENST00000390458.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV29DV5 description:T cell receptor alpha variable 29/delta variable 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12127]
+MAMLLGASVLILWLQPDWVNSQQKNDDQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFL
+WYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYFCAAS
+>ENSP00000451308.1 pep chromosome:GRCh38:14:22168429:22168988:1 gene:ENSG00000259092.1 transcript:ENST00000557168.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV30 description:T cell receptor alpha variable 30 [Source:HGNC Symbol;Acc:HGNC:12129]
+METLLKVLSGTLLWQLTWVRSQQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHG
+EAPVFLMILLKGGEQKGHDKISASFNEKKQQSSLYLTASQLSYSGTYFCGTE
+>ENSP00000450865.1 pep chromosome:GRCh38:14:22304054:22304553:1 gene:ENSG00000211818.1 transcript:ENST00000390466.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV39 description:T cell receptor alpha variable 39 [Source:HGNC Symbol;Acc:HGNC:12139]
+MKKLLAMILWLQLDRLSGELKVEQNPLFLSMQEGKNYTIYCNYSTTSDRLYWYRQDPGKS
+LESLFVLLSNGAVKQEGRLMASLDTKARLSTLHITAAVHDLSATYFCAVD
+>ENSP00000452002.1 pep chromosome:GRCh38:14:22207522:22208129:1 gene:ENSG00000211813.2 transcript:ENST00000390461.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV34 description:T cell receptor alpha variable 34 [Source:HGNC Symbol;Acc:HGNC:12133]
+METVLQVLLGILGFQAAWVSSQELEQSPQSLIVQEGKNLTINCTSSKTLYGLYWYKQKYG
+EGLIFLMMLQKGGEEKSHEKITAKLDEKKQQSSLHITASQPSHAGIYLCGAD
+>ENSP00000452585.1 pep chromosome:GRCh38:14:22314490:22314919:1 gene:ENSG00000211819.3 transcript:ENST00000390467.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV40 description:T cell receptor alpha variable 40 [Source:HGNC Symbol;Acc:HGNC:12141]
+MNSSLDFLILILMFGGTSSNSVKQTGQITVSEGASVTMNCTYTSTGYPTLFWYVEYPSKP
+LQLLQRETMENSKNFGGGNIKDKNSPIVKYSVQVSDSAVYYCLLG
+>ENSP00000450950.1 pep chromosome:GRCh38:14:22271968:22272563:1 gene:ENSG00000211816.2 transcript:ENST00000390464.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV38-1 description:T cell receptor alpha variable 38-1 [Source:HGNC Symbol;Acc:HGNC:12137]
+MTRVSLLWAVVVSTCLESGMAQTVTQSQPEMSVQEAETVTLSCTYDTSENNYYLFWYKQP
+PSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCAFMK
+>ENSP00000451177.1 pep chromosome:GRCh38:14:22320188:22320691:1 gene:ENSG00000211820.1 transcript:ENST00000390468.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV41 description:T cell receptor alpha variable 41 [Source:HGNC Symbol;Acc:HGNC:12142]
+MVKIRQFLLAILWLQLSCVSAAKNEVEQSPQNLTAQEGEFITINCSYSVGISALHWLQQH
+PGGGIVSLFMLSSGKKKHGRLIATINIQEKHSSLHITASHPRDSAVYICAVR
+>ENSP00000452332.1 pep chromosome:GRCh38:14:22281105:22281748:1 gene:ENSG00000211817.2 transcript:ENST00000390465.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV38-2DV8 description:T cell receptor alpha variable 38-2/delta variable 8 [Source:HGNC Symbol;Acc:HGNC:12138]
+MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYDTSESDYYLFWYKQP
+PSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDAAMYFCAYRS
+>ENSP00000450970.1 pep chromosome:GRCh38:14:22202583:22203368:1 gene:ENSG00000211812.1 transcript:ENST00000390460.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV26-2 description:T cell receptor alpha variable 26-2 [Source:HGNC Symbol;Acc:HGNC:12124]
+MKLVTSITVLLSLGIMGDAKTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQG
+PEYVIHGLTSNVNNRMASLAIAEDRKSSTLILHRATLRDAAVYYCILRD
+>ENSP00000451750.1 pep chromosome:GRCh38:14:22469041:22469698:-1 gene:ENSG00000256590.2 transcript:ENST00000535880.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRDV3 description:T cell receptor delta variable 3 [Source:HGNC Symbol;Acc:HGNC:12264]
+MILTVGFSFLFFYRGTLCDKVTQSSPDQTVASGSEVVLLCTYDTVYSNPDLFWYRIRPDY
+SFQFVFYGDNSRSEGADFTQGRFSVKHILTQKAFHLVISPVRTEDSATYYCAF
+>ENSP00000451578.1 pep chromosome:GRCh38:14:22422371:22423042:1 gene:ENSG00000211821.2 transcript:ENST00000390469.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRDV2 description:T cell receptor delta variable 2 [Source:HGNC Symbol;Acc:HGNC:12263]
+MQRISSLIHLSLFWAGVMSAIELVPEHQTVPVSIGVPATLRCSMKGEAIGNYYINWYRKT
+QGNTMTFIYREKDIYGPGFKDNFQGDIDIAKNLAVLKILAPSERDEGSYYCACDT
+>ENSP00000450804.1 pep chromosome:GRCh38:14:22226746:22227254:1 gene:ENSG00000211815.3 transcript:ENST00000390463.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRAV36DV7 description:T cell receptor alpha variable 36/delta variable 7 [Source:HGNC Symbol;Acc:HGNC:12135]
+MMKCPQALLAIFWLLLSWVSSEDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQE
+KKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAIYLCAVE
+>ENSP00000388523.3 pep chromosome:GRCh38:7:142300924:142301432:1 gene:ENSG00000226660.2 transcript:ENST00000455382.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV2 description:T cell receptor beta variable 2 [Source:HGNC Symbol;Acc:HGNC:12195]
+MDTWLVCWAIFSLLKAGLTEPEVTQTPSHQVTQMGQEVILRCVPISNHLYFYWYRQILGQ
+KVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE
+>ENSP00000374910.3 pep chromosome:GRCh38:7:142308542:142309048:1 gene:ENSG00000237702.2 transcript:ENST00000390387.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV3-1 description:T cell receptor beta variable 3-1 [Source:HGNC Symbol;Acc:HGNC:12212]
+MGCRLLCCVVFCLLQAGPLDTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKK
+FLKIMFSYNNKELIINETVPNRFSPKSPDKAHLNLHINSLELGDSAVYFCASSQ
+>ENSP00000374880.3 pep chromosome:GRCh38:7:142313184:142313666:1 gene:ENSG00000211710.3 transcript:ENST00000390357.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-1 description:T cell receptor beta variable 4-1 [Source:HGNC Symbol;Acc:HGNC:12215]
+MGCRLLCCAVLCLLGAVPIDTEVTQTPKHLVMGMTNKKSLKCEQHMGHRAMYWYKQKAKK
+PPELMFVYSYEKLSINESVPSRFSPECPNSSLLNLHLHALQPEDSALYLCASSQ
+>ENSP00000374904.3 pep chromosome:GRCh38:7:142320677:142321544:1 gene:ENSG00000211734.3 transcript:ENST00000390381.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-1 description:T cell receptor beta variable 5-1 [Source:HGNC Symbol;Acc:HGNC:12218]
+MGSRLLCWVLLCLLGAGPVKAGVTQTPRYLIKTRGQQVTLSCSPISGHRSVSWYQQTPGQ
+GLQFLFEYFSETQRNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYLCASSL
+>ENSP00000374876.2 pep chromosome:GRCh38:7:142328297:142328786:1 gene:ENSG00000211706.2 transcript:ENST00000390353.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-1 description:T cell receptor beta variable 6-1 [Source:HGNC Symbol;Acc:HGNC:12226]
+MSIGLLCCVAFSLLWASPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHNSMYWYRQDPGM
+GLRLIYYSASEGTTDKGEVPNGYNVSRLNKREFSLRLESAAPSQTSVYFCASSE
+>ENSP00000448600.2 pep chromosome:GRCh38:7:142332182:142332701:1 gene:ENSG00000211707.3 transcript:ENST00000547918.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-1 description:T cell receptor beta variable 7-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12235]
+MGTRLLCWAAICLLGADHTGAGVSQSLRHKVAKKGKDVALRYDPISGHNALYWYRQSLGQ
+GLEFPIYFQGKDAADKSGLPRDRFSAQRSEGSISTLKFQRTQQGDLAVYLCASSS
+>ENSP00000374915.3 pep chromosome:GRCh38:7:142345421:142345985:1 gene:ENSG00000211745.3 transcript:ENST00000390392.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV4-2 description:T cell receptor beta variable 4-2 [Source:HGNC Symbol;Acc:HGNC:12216]
+MGCRLLCCAVLCLLGAVPMETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKK
+PLELMFVYNFKEQTENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
+>ENSP00000488969.1 pep chromosome:GRCh38:7:142349152:142349664:1 gene:ENSG00000283063.1 transcript:ENST00000634383.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-2 description:T cell receptor beta variable 6-2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12227]
+MSLGLLCCGAFSLLWAGPVNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGM
+GLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
+>ENSP00000489072.1 pep chromosome:GRCh38:7:142352819:142353358:1 gene:ENSG00000282939.1 transcript:ENST00000634605.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-2 description:T cell receptor beta variable 7-2 [Source:HGNC Symbol;Acc:HGNC:12236]
+MGTRLLFWVAFCLLGADHTGAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQ
+GLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL
+>ENSP00000374883.3 pep chromosome:GRCh38:7:142380806:142381261:1 gene:ENSG00000211713.3 transcript:ENST00000390360.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-4 description:T cell receptor beta variable 6-4 [Source:HGNC Symbol;Acc:HGNC:12229]
+MSIRLLCCVAFSLLWAGPVTAGITQAPTSQILAAGRSMTLRCTQDMRHNAMYWYRQDLGL
+GLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD
+>ENSP00000374884.3 pep chromosome:GRCh38:7:142384329:142384841:1 gene:ENSG00000211714.3 transcript:ENST00000390361.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-3 description:T cell receptor beta variable 7-3 [Source:HGNC Symbol;Acc:HGNC:12237]
+MGTRLLCWAALCLLGADHTGAGVSQTPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQ
+GPEFLIYFQGTGAADDSGLPKDRFFAVRPEGSVSTLKIQRTEQGDSAAYLRASSL
+>ENSP00000374885.1 pep chromosome:GRCh38:7:142389202:142389668:1 gene:ENSG00000211715.1 transcript:ENST00000390362.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-3 description:T cell receptor beta variable 5-3 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12220]
+MGPGLLCWELLYLLGAGPVEAGVTQSPTHLIKTRGQQVTLRCSPISGHSSVSWYQQAPGQ
+GPQFIFEYANELRRSEGNFPNRFSGRQFHDYCSEMNVSALELGDSALYLCARSL
+>ENSP00000374886.2 pep chromosome:GRCh38:7:142391891:142392412:1 gene:ENSG00000211716.2 transcript:ENST00000390363.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV9 description:T cell receptor beta variable 9 [Source:HGNC Symbol;Acc:HGNC:12246]
+MGFRLLCCVAFCLLGAGPVDSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQ
+GLQFLIHYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV
+>ENSP00000374887.3 pep chromosome:GRCh38:7:142399860:142400377:1 gene:ENSG00000211717.3 transcript:ENST00000390364.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-1 description:T cell receptor beta variable 10-1(gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12177]
+MGTRLFFYVALCLLWAGHRDAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGH
+GLRLIHYSYGVHDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE
+>ENSP00000374890.3 pep chromosome:GRCh38:7:142407672:142408136:1 gene:ENSG00000211720.3 transcript:ENST00000390367.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-1 description:T cell receptor beta variable 11-1 [Source:HGNC Symbol;Acc:HGNC:12180]
+MSTRLLCWMALCLLGAELSEAEVAQSPRYKITEKSQAVAFWCDPISGHATLYWYRQILGQ
+GPELLVQFQDESVVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAMYLCASSL
+>ENSP00000404652.2 pep chromosome:GRCh38:7:142424965:142425465:1 gene:ENSG00000229769.2 transcript:ENST00000426318.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-2 description:T cell receptor beta variable 10-2 [Source:HGNC Symbol;Acc:HGNC:12178]
+MGTRLFFYVALCLLWAGHRDAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGH
+GLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE
+>ENSP00000374891.2 pep chromosome:GRCh38:7:142450947:142451448:1 gene:ENSG00000211721.2 transcript:ENST00000390368.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-5 description:T cell receptor beta variable 6-5 [Source:HGNC Symbol;Acc:HGNC:12230]
+MSIGLLCCAALSLLWAGPVNAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGM
+GLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSY
+>ENSP00000374892.2 pep chromosome:GRCh38:7:142455174:142455635:1 gene:ENSG00000253409.1 transcript:ENST00000390369.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-4 description:T cell receptor beta variable 7-4 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12238]
+MGTRLLCWVVLGFLGTDHTGAGVSQSPRYKVAKRGRDVALRCDSISGHVTLYWYRQTLGQ
+GSEVLTYSQSDAQRDKSGRPSGRFSAERPERSVSTLKIQRTEQGDSAVYLCASSL
+>ENSP00000413966.2 pep chromosome:GRCh38:7:142462916:142463581:1 gene:ENSG00000230099.2 transcript:ENST00000454561.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-4 description:T cell receptor beta variable 5-4 [Source:HGNC Symbol;Acc:HGNC:12221]
+MGPGLLCWALLCLLGAGSVETGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQ
+GPQFIFQYYREEENGRGNFPPRFSGLQFPNYSSELNVNALELDDSALYLCASSL
+>ENSP00000374894.3 pep chromosome:GRCh38:7:142469537:142470013:1 gene:ENSG00000211724.3 transcript:ENST00000390371.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-6 description:T cell receptor beta variable 6-6 [Source:HGNC Symbol;Acc:HGNC:12231]
+MSISLLCCAAFPLLWAGPVNAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGM
+GLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSY
+>ENSP00000374895.3 pep chromosome:GRCh38:7:142482548:142483019:1 gene:ENSG00000211725.3 transcript:ENST00000390372.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-5 description:T cell receptor beta variable 5-5 [Source:HGNC Symbol;Acc:HGNC:12222]
+MGPGLLCWVLLCLLGAGPVDAGVTQSPTHLIKTRGQQVTLRCSPISGHKSVSWYQQVLGQ
+GPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASSL
+>ENSP00000374896.2 pep chromosome:GRCh38:7:142487863:142488295:1 gene:ENSG00000253188.1 transcript:ENST00000390373.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-7 description:T cell receptor beta variable 6-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12232]
+MSLGLLCCVAFSLLWAGPMNAGVTQTPKFHVLKTGQSMTLLCAQDMNHEYMYRYRQDPGK
+GLRLIYYSVAAALTDKGEVPNGYNVSRSNTEDFPLKLESAAPSQTSVYFCASSY
+>ENSP00000374897.3 pep chromosome:GRCh38:7:142492132:142492673:1 gene:ENSG00000211727.3 transcript:ENST00000390374.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-6 description:T cell receptor beta variable 7-6 [Source:HGNC Symbol;Acc:HGNC:12240]
+MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQ
+GPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
+>ENSP00000374898.2 pep chromosome:GRCh38:7:142500028:142500534:1 gene:ENSG00000211728.2 transcript:ENST00000390375.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-6 description:T cell receptor beta variable 5-6 [Source:HGNC Symbol;Acc:HGNC:12223]
+MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKSGHDTVSWYQQALGQ
+GPQFIFQYYEEEERQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYLCASSL
+>ENSP00000374899.2 pep chromosome:GRCh38:7:142507382:142507810:1 gene:ENSG00000253534.1 transcript:ENST00000390376.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV6-8 description:T cell receptor beta variable 6-8 [Source:HGNC Symbol;Acc:HGNC:12233]
+MSLGLLCCAAFSLLWAGPVNAGVTQTPKFHILKTGQSMTLQCAQDMNHGYMSWYRQDPGM
+GLRLIYYSAAAGTTDKEVPNGYNVSRLNTEDFPLRLVSAAPSQTSVYLCASSY
+>ENSP00000374900.1 pep chromosome:GRCh38:7:142511626:142512127:1 gene:ENSG00000253291.1 transcript:ENST00000390377.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-7 description:T cell receptor beta variable 7-7 [Source:HGNC Symbol;Acc:HGNC:12241]
+MGTSLLCWVVLGFLGTDHTGAGVSQSPRYKVTKRGQDVTLRCDPISSHATLYWYQQALGQ
+GPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
+>ENSP00000374901.1 pep chromosome:GRCh38:7:142520090:142520556:1 gene:ENSG00000211731.1 transcript:ENST00000390378.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV5-7 description:T cell receptor beta variable 5-7 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12224]
+MGPGLLCWVLLCPLGEGPVDAGVTQSPTHLIKTRGQHVTLRCSPISGHTSVSSYQQALGQ
+GPQFIFQYYEKEERGRGNFPDQFSGHQFPNYSSELNVNALLLGDSALYLCASSL
+>ENSP00000478301.1 pep chromosome:GRCh38:7:142529290:142529762:1 gene:ENSG00000278030.1 transcript:ENST00000612787.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV7-9 description:T cell receptor beta variable 7-9 [Source:HGNC Symbol;Acc:HGNC:12243]
+MGTSLLCWMALCLLGADHADTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQ
+GPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
+>ENSP00000477580.1 pep chromosome:GRCh38:7:142535809:142536292:1 gene:ENSG00000276405.1 transcript:ENST00000614171.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV13 description:T cell receptor beta variable 13 [Source:HGNC Symbol;Acc:HGNC:12188]
+MLSPDLPDSAWNTRLLCRVMLCLLGAGSVAAGVIQSPRHLIKEKRETATLKCYPIPRHDT
+VYWYQQGPGQDPQFLISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFC
+ASSL
+>ENSP00000479267.1 pep chromosome:GRCh38:7:142544212:142544685:1 gene:ENSG00000275791.1 transcript:ENST00000611462.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV10-3 description:T cell receptor beta variable 10-3 [Source:HGNC Symbol;Acc:HGNC:12179]
+MGTRLFFYVALCLLWTGHMDAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGH
+GLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE
+>ENSP00000480080.1 pep chromosome:GRCh38:7:142554836:142555318:1 gene:ENSG00000276597.1 transcript:ENST00000611787.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV11-3 description:T cell receptor beta variable 11-3 [Source:HGNC Symbol;Acc:HGNC:12182]
+MGTRLLCWVAFCLLVEELIEAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYLQNLGQ
+GPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL
+>ENSP00000477916.1 pep chromosome:GRCh38:7:142560423:142560931:1 gene:ENSG00000274752.1 transcript:ENST00000620569.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-3 description:T cell receptor beta variable 12-3 [Source:HGNC Symbol;Acc:HGNC:12185]
+MDSWTFCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMR
+GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
+>ENSP00000480999.1 pep chromosome:GRCh38:7:142563740:142564245:1 gene:ENSG00000276953.1 transcript:ENST00000617347.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-4 description:T cell receptor beta variable 12-4 [Source:HGNC Symbol;Acc:HGNC:12186]
+MGSWTLCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMR
+GLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
+>ENSP00000479506.1 pep chromosome:GRCh38:7:142580917:142581427:1 gene:ENSG00000275158.1 transcript:ENST00000621184.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV12-5 description:T cell receptor beta variable 12-5 [Source:HGNC Symbol;Acc:HGNC:12187]
+MATRLLCCVVLCLLGEELIDARVTQTPRHKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQ
+GLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGL
+>ENSP00000477671.1 pep chromosome:GRCh38:7:142587868:142588359:1 gene:ENSG00000275743.1 transcript:ENST00000617639.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV14 description:T cell receptor beta variable 14 [Source:HGNC Symbol;Acc:HGNC:12189]
+MVSRLLSLVSLCLLGAKHIEAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGK
+EIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQ
+>ENSP00000482333.1 pep chromosome:GRCh38:7:142592928:142593473:1 gene:ENSG00000276819.1 transcript:ENST00000616518.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV15 description:T cell receptor beta variable 15 [Source:HGNC Symbol;Acc:HGNC:12190]
+MGPGLLHWMALCLLGTGHGDAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQ
+APKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYLCATSR
+>ENSP00000479210.1 pep chromosome:GRCh38:7:142598016:142598469:1 gene:ENSG00000275243.1 transcript:ENST00000620773.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV16 description:T cell receptor beta variable 16 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12191]
+MSPIFTCITILCLLAAGSPGEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKN
+EFKFLISFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASSQ
+>ENSP00000483468.1 pep chromosome:GRCh38:7:142601628:142602360:1 gene:ENSG00000277880.1 transcript:ENST00000619103.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV17 description:T cell receptor beta variable 17 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12192]
+MDIWLLCWVTLCLLAAGHSEPGVSQTPRHKVTNMGQEVILRCDPSSGHMFVHWYRQNLRQ
+EMKLLISFQYQNIAVDSGMPKERFTAERPNGTSSTLKIHPAEPRDSAVYLYSSG
+>ENSP00000483504.1 pep chromosome:GRCh38:7:142615716:142616415:1 gene:ENSG00000276557.1 transcript:ENST00000611520.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV18 description:T cell receptor beta variable 18 [Source:HGNC Symbol;Acc:HGNC:12193]
+MDTRLLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEE
+GLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSP
+>ENSP00000374916.3 pep chromosome:GRCh38:7:142618849:142619532:1 gene:ENSG00000211746.3 transcript:ENST00000390393.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV19 description:T cell receptor beta variable 19 [Source:HGNC Symbol;Acc:HGNC:12194]
+MSNQVLCCVVLCLLGANTVDGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQ
+GLRLIYYSQIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI
+>ENSP00000374917.3 pep chromosome:GRCh38:7:142626649:142627399:1 gene:ENSG00000211747.3 transcript:ENST00000390394.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV20-1 description:T cell receptor beta variable 20-1 [Source:HGNC Symbol;Acc:HGNC:12196]
+MLLLLLLLGPGSGLGAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML
+MATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
+>ENSP00000374919.1 pep chromosome:GRCh38:7:142645961:142646467:1 gene:ENSG00000211749.1 transcript:ENST00000390396.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV23-1 description:T cell receptor beta variable 23-1 (non-functional) [Source:HGNC Symbol;Acc:HGNC:12201]
+MGTRLLGCAALCLLAADSFHAKVTQTPGHLVKGKGQKTKMDCTPEKGHTFVYWYQQNQNK
+EFMLLISFQNEQVLQETEMHKKRFSSQCPKNAPCSLAILSSEPGDTALYLCASSQ
+>ENSP00000374920.2 pep chromosome:GRCh38:7:142656701:142657213:1 gene:ENSG00000211750.2 transcript:ENST00000390397.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV24-1 description:T cell receptor beta variable 24-1 [Source:HGNC Symbol;Acc:HGNC:12203]
+MASLLFFCGAFYLLGTGSMDADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGL
+GLQLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
+>ENSP00000374921.3 pep chromosome:GRCh38:7:142670740:142671244:1 gene:ENSG00000282499.1 transcript:ENST00000390398.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV25-1 description:T cell receptor beta variable 25-1 [Source:HGNC Symbol;Acc:HGNC:12205]
+MTIRLLCYVGFYFLGAGLMEADIYQTPRYLVIGTGKKITLECSQTMGHDKMYWYQQDPGM
+ELHLIHYSYGVNSTEKGDLSSESTVSRIRTEHFPLTLESARPSHTSQYLCASSE
+>ENSP00000374922.3 pep chromosome:GRCh38:7:142715346:142715861:1 gene:ENSG00000211752.3 transcript:ENST00000390399.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV27 description:T cell receptor beta variable 27 [Source:HGNC Symbol;Acc:HGNC:12208]
+MGPQLLGYVVLCLLGAGPLEAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGL
+GLRQIYYSMNVEVTDKGDVPEGYKVSRKEKRNFPLILESPSPNQTSLYFCASSL
+>ENSP00000397118.2 pep chromosome:GRCh38:7:142812586:142813399:-1 gene:ENSG00000237254.2 transcript:ENST00000417977.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV30 description:T cell receptor beta variable 30 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:12214]
+MLCSLLALLLGTFFGVRSQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRG
+LQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS
+>ENSP00000374923.2 pep chromosome:GRCh38:7:142720660:142721160:1 gene:ENSG00000211753.4 transcript:ENST00000390400.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV28 description:T cell receptor beta variable 28 [Source:HGNC Symbol;Acc:HGNC:12209]
+MGIRLLCRVAFCFLAVGLVDVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGL
+GLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASSL
+>ENSP00000395459.2 pep chromosome:GRCh38:7:142740206:142740894:1 gene:ENSG00000232869.2 transcript:ENST00000422143.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:TRBV29-1 description:T cell receptor beta variable 29-1 [Source:HGNC Symbol;Acc:HGNC:12210]
+MLSLLLLLLGLGSVFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTL
+IATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVE
+>ENSP00000481428.1 pep chromosome:GRCh38:CHR_HSCHR19_4_CTG3_1:54840878:54856485:1 gene:ENSG00000273931.1 transcript:ENST00000610808.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:KIR2DS4 description:killer cell immunoglobulin like receptor, two Ig domains and short cytoplasmic tail 4 [Source:HGNC Symbol;Acc:HGNC:6336]
+MSLMVIIMACVGFFLLQGAWPQEGVHRKPSFLALPGHLVKSEETVILQCWSDVMFEHFLL
+HREGKFNNTLHLIGEHHDGVSKANFSIGPMMPVLAGTYRCYGSVPHSPYQLSAPSDPLDM
+VIIGLYEKPSLSAQPGPTVQAGENVTLSCSSIYPGKGRPMNVGSLQCAASTEHSRPTFLW
+ALPPTEGPTDASALSVTLPTSGQTRVIHCLFPSQETLQIVGLHPLNQAPKPVTPDTYMF*
+LGPQWSKSLSPSSSSFSFIAGAPTKKMLL*WTKSLQGTEQ*TARILMNKTIRRCHTH
+>ENSP00000492265.1 pep chromosome:GRCh38:8:43292483:43363319:1 gene:ENSG00000188877.12 transcript:ENST00000522175.7 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:POTEA description:POTE ankyrin domain family member A [Source:HGNC Symbol;Acc:HGNC:33893]
+MVAEVSPKLAASPMKKPFGFRGKMGKWCCCCFPCCRGSGKNNMGAWRDHDDSAFTEPRYH
+VRREDLGKLHRAAWWGEVPRADLIVMLRGPGINKRDKKKRTALHLACANGNSEVVSLLLD
+RQCQLHVFDSKKRTALIKAVQCQEDECALMLLQHGTDPNLPDMYGNTALHYAVYNEDKLM
+AKTLLLYGADIESKNKGGLTPLLLAVHGQKQRMVKFLIKKKANLNALDRFGRICQLLSDY
+KENQMPNNSSGNSNPEQDLKLTSEEEPQRLKGSENSQHEKVTQEPDINKDCDREVEEEMQ
+KHGSNNVGLSENLTDGAAAGNGDGGLVPQRKSRKHENQQFPNTEIEEYHRPEKKSNEKNK
+VKSQIHSVDNLDDITWPSEIASEDYDLLFSNYETFTLLIEQLKMDFNDSASLSKIQDAVI
+SEEHLLELKNSHYEQLTVEVEQMENMVHVLQK*LSEAKETQLQLAPQKGECEQERYSSSE
+EQNDTRKQLSKEQNARILQDEILTTKQKQIEVAEKKMNFEISLSHKEEKELLHENSMMQE
+EIAMLRIELDTIKHQNQLREKKYLEYIKSVKEKNDNLLKAIQLNEEALTKAVVQYSGQLS
+ILTTENKMLSFELQNVRHNNETLEMEIQSCHFRLATALHDCD
+>ENSP00000492193.1 pep chromosome:GRCh38:8:43292483:43363518:1 gene:ENSG00000188877.12 transcript:ENST00000519951.2 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:POTEA description:POTE ankyrin domain family member A [Source:HGNC Symbol;Acc:HGNC:33893]
+MVAEVSPKLAASPMKKPFGFRGKMGKWCCCCFPCCRGSGKNNMGAWRDHDDSAFTEPRYH
+VRREDLGKLHRAAWWGEVPRADLIVMLRGPGINKRDKKKRTALHLACANGNSEVVSLLLD
+RQCQLHVFDSKKRTALIKAVQCQEDECALMLLQHGTDPNLPDMYGNTALHYAVYNEDKLM
+AKTLLLYGADIESKNKGGLTPLLLAVHGQKQRMVKFLIKKKANLNALDRFGRTALILAVR
+CGSASIVSLLLQQNIDVFSQDVFGQTAEDYAVSSHHSIICQLLSDYKENQMPNNSSGNSN
+PEQDLKLTSEEEPQRLKGSENSQHEKVTQEPDINKDCDREVEEEMQKHGSNNVGLSENLT
+DGAAAGNGDGGLVPQRKSRKHENQQFPNTEIEEYHRPEKKSNEKNKVKSQIHSVDNLDDI
+TWPSEIASEDYDLLFSNYETFTLLIEQLKMDFNDSASLSKIQDAVISEEHLLELKNSHYE
+QLTVEVEQMENMVHVLQK*LSEAKETQLQLAPQKGECEQERYSSSEEQNDTRKQLSKEQN
+ARILQDEILTTKQKQIEVAEKKMNFEISLSHKEEKELLHENSMMQEEIAMLRIELDTIKH
+QNQLREKKYLEYIKSVKEKNDNLLKAIQLNEEALTKAVVQYSGQLSILTTENKMLSFELQ
+NVRHNNETLEMEIQSCHFRLATALHDCD
+>ENSP00000477333.1 pep chromosome:GRCh38:9:36002909:36003867:-1 gene:ENSG00000243641.3 transcript:ENST00000424348.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR13C7 description:olfactory receptor family 13 subfamily C member 7 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15102]
+MVSANQTASVTEFILLGLSAHPKLEKTFFVLILLMYLVILLGNGVLILMTVSNSHLHMPM
+YFFLGNLSFLDICYTTSSVPLILDSFLTPRKTISFSACAVQMFLSFAMGATECVLLSMMA
+FDRYVAICNPLRYPVVMSKAAYMPIRLPAPG*LEALLPWCRHPLQ*GCPSVETTSSTTSP
+VRFWLS*SWPVLISLSM*SVWE*PM*SSWGSRFCSSLSPMSSSLPPS*GSPQLRGGKRPS
+PPALPTSQSWSSSMGPSSSCMGSPSLRTRWGQTSKTLQTNSFPFSMGW*PPCSTPSSTA*
+GTRM*RLL*GT*YFRNALP
+>ENSP00000426627.1 pep chromosome:GRCh38:1:161581340:161599803:1 gene:ENSG00000244682.7 transcript:ENST00000466542.6 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:FCGR2C description:Fc fragment of IgG receptor IIc (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15626]
+MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAAPPKAVLKLEP*WIN
+VLQEDSVTLTCRGTHSPESDSIPWFHNGNLIPTHTQPSYRFKANNNDSGEYTCQTGQTSL
+SDPVHLTVLSEWLVLQTPHLEFQEGETIVLRCHSWKDKPLVKVTFFQNGKSKKFSRSDPN
+FSIPQANHSHSGDYHCTGNIGYTLYSSKPVTITVQAPSSSPMGIIVAVVTGIAVAAIVAA
+VVALIYCRKKRISANSTDPVKAAQFEPPGRQMIAIRKRQPEETNNDYETADGGYMTLNPR
+APTDDDKNIYLTLPPNDHVNSNN
+>ENSP00000480953.1 pep chromosome:GRCh38:1:161581436:161599828:1 gene:ENSG00000244682.7 transcript:ENST00000611236.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:FCGR2C description:Fc fragment of IgG receptor IIc (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15626]
+MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAAPPKAVLKLEPWINV
+LQEDSVTLTCRGTHSPESDSIPWFHNGNLIPTHTQPSYRFKANNNDSGEYTCQTGQTSLS
+DPVHLTVLSEWLVLQTPHLEFQEGETIVLRCHSWKDKPLVKVTFFQNGKSKKFSRSDPNF
+SIPQANHSHSGDYHCTGNIGYTLYSSKPVTITVQAPSSSPMGIIVAVVTGIAVAAIVAAV
+VALIYCRKKRISATWTSNDCHQKETT
+>ENSP00000444663.2 pep chromosome:GRCh38:1:161581339:161600242:1 gene:ENSG00000244682.7 transcript:ENST00000543859.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:FCGR2C description:Fc fragment of IgG receptor IIc (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15626]
+MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAAPPKAVLKLEPWINV
+LQEDSVTLTCRGTHSPESDSIPWFHNGNLIPTHTQPSYRFKANNNDSGEYTCQTGQTSLS
+DPVHLTVLSEWLVLQTPHLEFQEGETIVLRCHSWKDKPLVKVTFFQNGKSKKFSRSDPNF
+SIPQANHSHSGDYHCTGNIGYTLYSSKPVTITVQAPSSSPMGIIVAVVTGIAVAAIVAAV
+VALIYCRKKRISANSTDPVKAAQFEPPGRQMIAIRKRQPEETNNDYETADGGYMTLNPRA
+PTDDDKNIYLTLPPNDHVNSNN
+>ENSP00000427945.1 pep chromosome:GRCh38:8:141433829:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000523857.5 gene_biotype:polymorphic_pseudogene transcript_biotype:nonsense_mediated_decay gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976]
+MDRQCSERPYSCTPTGRVSSAVSQNSSHRLQDAAGHEQC
+>ENSP00000429433.1 pep chromosome:GRCh38:8:141433829:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000521053.5 gene_biotype:polymorphic_pseudogene transcript_biotype:nonsense_mediated_decay gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976]
+MDRQCSERPYSCTPTGRVSSAVSQNSSHRLQDAAGHEQC
+>ENSP00000431031.1 pep chromosome:GRCh38:8:141433832:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000430863.5 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976]
+MDRQCSERPYSCTPTGRVSSAVSQNSRISPPVSTSMKDSSCMKVHQDSARRDRWSHPTTI
+LLHKSQSSQATLMLQEHRMFMGEAYSAATGFKMLQDMNSADPFHLKYIIKKIKNMAHGSP
+KLVMETIHDYFIDNPEISSRHKFRLFQTLEMVIGASDVLEETWEKTFTRLALENMTKATE
+LEDIYQDAASNMLVAICRHSWRVVAQHLETELLTGVFPHRSLLYVMGVLSSSEELFSQED
+KACWEEQLIQMAIKSVPFLSTDVWSKELLWTLTTPSWTQQEQSPEKAFLFTYYGLILQAE
+KNGATVRRHLQALLETSHQWPKQREGMALTLGLAATRHLDDVWAVLDQFGRSRPIRWSLP
+SSSPKNSEDLRWKWASSTILLAYGQVAAKARAHILPWVDNIVSRMVFYFHYSSWDETLKQ
+SFLTATLMLMGAVSRSEGAHSYEFFQTSELLQCLMVLMEKEPQDTLCTRSRQQAMHIASS
+LCKLRPPIDLERKSQLLSTCFRSVFALPLLDALEKHTCLFLEPPNIQLWPVARERAGWTH
+QGWGPRAVLHCSEHLQSLYSRTMEALDFMLQSLIMQNPTADELHFLLSHLYIWLASEKAH
+ERQRAVHSCMILLKFLNHNGYLDPKEDFKRIGQLVGILGMLCQDPDRATQRCSLEGASHL
+YQLLMCHKTGEALQAESQAPKELSQAHSDGAPLWNSRDQKATPLGPQEMAKNHIFQLCSF
+QVIKDIMQQLTLAELSDLIWTAIDGLGSTSPFRVQAASEMLLTAVQEHGAKLEIVSSMAQ
+AIRLRLCSVHIPQAKEKTLHAITLLARSHTCELVATFLNISIPLDSHTFQLWRALGAGQP
+TSHLVLTTLLACLQERPLPTGASDSSPCPKEKTYLRLLAAMNMLHELQFAREFKQAVQEG
+YPKLFLALLTQMHYVLELNLPSEP*PKQQAQEAAVPSPQSCSTSLEALKSLLSTTGHWHD
+FAHLELQGSWELFTTIHTYPKGVGLLARAMVQNHCRQIPAVLRQLLPSLQSPQERERKVA
+ILILTKFLYSPVLLEVLPKQAALTVLAQGLHDPSPEVRVLSLQGLSNILFHPDKGSLLQG
+QLRPLLDGFFQSSDQVIVCIMGTVSDTLHRLGAQGTGSQSLGVAISTRSFFNDERDGIRA
+AAMALFGDLVAAMADRELSGLRTQVHQSMVPLLLHLKDQCPAVATQAKFTFYRCAVLLRW
+RLLHTLFCTLAWERGLSARHFLWTCLMTRSQEEFSIHLSQALSYLHSHSCHIKTWVTLFI
+GHTICYHPQAVFQMLNAVDTNLLFRTFEHLRSDPEPSIREFATSQLSFLQKVSARPKQ
+>ENSP00000429440.1 pep chromosome:GRCh38:8:141494911:141496759:-1 gene:ENSG00000226807.6 transcript:ENST00000521161.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976]
+MKDSSCMKVHQDSARRDRWSHPTTILLHKSQSSQATLMLQEHRMFMGEAYSAATGFKMLQ
+DMNSADPFHLKYIIKKIKNMAHGSPKLVMETIHDYFIDNPEISSRHKFRL
+>ENSP00000481783.1 pep chromosome:GRCh38:8:141433829:141507230:-1 gene:ENSG00000226807.6 transcript:ENST00000621837.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:MROH5 description:maestro heat like repeat family member 5 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:42976]
+MDRQCSERPYSCTPTGRVSSAVSQNSRISPPVSTSMKDSSCMKVHQDSARRDRWSHPTTI
+LLHKSQSSQATLMLQEHRMFMGEAYSAATGFKMLQDMNSADPFHLKYIIKKIKNMAHGSP
+KLVMETIHDYFIDNPEISSRHKFRLFQTLEMVIGASDVLEETWEKTFTRLALENMTKATE
+LEDIYQDAASNMLVAICRHSWRVVAQHLETELLTGVFPHRSLLYVMGVLSSSEELFSQED
+KACWEEQLIQMAIKSVPFLSTDVWSKELLWTLTTPSWTQQEQSPEKAFLFTYYGLILQAE
+KNGATVRRHLQALLETSHQWPKQREGMALTLGLAATRHLDDVWAVLDQFGRSRPIRWSLP
+SSSPKNSEDLRWKWASSTILLAYGQVAAKARAHILPWVDNIVSRMVFYFHYSSWDETLKQ
+SFLTATLMLMGAVSRSEGAHSYEFFQTSELLQCLMVLMEKEPQDTLCTRSRQQAMHIASS
+LCKLRPPIDLERKSQLLSTCFRSVFALPLLDALEKHTCLFLEPPNIQLWPVARERAGWTH
+QGWGPRAVLHCSEHLQSLYSRTMEALDFMLQSLIMQNPTADELHFLLSHLYIWLASEKAH
+ERQRAVHSCMILLKFLNHNGYLDPKEDFKRIGQLVGILGMLCQDPDRATQRCSLEGASHL
+YQLLMCHKTGEALQAESQAPKELSQAHSDGAPLWNSRDQKATPLGPQEMAKNHIFQLCSF
+QVIKDIMQQLTLAELSDLIWTAIDGLGSTSPFRVQAASEMLLTAVQEHGAKLEIVSSMAQ
+AIRLRLCSVHIPQAKEKTLHAITLLARSHTCELVATFLNISIPLDSHTFQLWRALGAGQP
+TSHLVLTTLLACLQERPLPTGASDSSPCPKEKTYLRLLAAMNMLHELQFAREFKQAVQEG
+YPKLFLALLTQMHYVLELNLPSEPPKQQAQEAAVPSPQSCSTSLEALKSLLSTTGHWHDF
+AHLELQGSWELFTTIHTYPKGVGLLARAMVQNHCRQIPAVLRQLLPSLQSPQERERKVAI
+LILTKFLYSPVLLEVLPKQAALTVLAQGLHDPSPEVRVLSLQGLSNILFHPDKGSLLQGQ
+LRPLLDGFFQSSDQVIVCIMGTVSDTLHRLGAQGTGSQSLGVAISTRSFFNDERDGIRAA
+AMALFGDLVAAMADRELSGLRTQVHQSMVPLLLHLKDQCPAVATQAKFTFYRCAVLLRWR
+LLHTLFCTLAWERGLSARHFLWTCLMTRSQEEFSIHLSQALSYLHSHSCHIKTWVTLFIG
+HTICYHPQAVFQMLNAVDTNLLFRTFEHLRSDPEPSIREFATSQLSFLQKVSARPKQ
+>ENSP00000485975.1 pep chromosome:GRCh38:CHR_HG142_HG150_NOVEL_TEST:56210873:56211820:-1 gene:ENSG00000263150.3 transcript:ENST00000570683.3 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR8J2 description:olfactory receptor family 8 subfamily J member 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15311]
+MASGNLTWVTEFILVGVSDDPELQIPLFLVFLVLYLLTVAGNLGIITLTSVDPQLQTPMY
+FFL*HLAIINLCNSTVVAPKMLVNFLVTKKTISYYGCAAQLGGFLVFIVAEIFTLAAMAY
+DRYVAIWSPLLYAVVVSPKVCRLLVSLTYLQSLITALTVSSCVFSVSYCSSNIINHFYCD
+DVPLLALSCSDTYIPETAVFIFSGTNLLFSMIVVLISYFNIVITILRIRSSEGRQKAFST
+CASHMIAVVVFYGTLLFMYLQPRSNHSLDTDKMASVFYTLVIPVLNPLIYSLRNKNVKDA
+LKRFLDNPCRSLKLM
+>ENSP00000460880.1 pep chromosome:GRCh38:CHR_HG142_HG150_NOVEL_TEST:56318307:56319245:1 gene:ENSG00000262755.1 transcript:ENST00000573400.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR8K3 description:olfactory receptor family 8 subfamily K member 3 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15313]
+MEQHNLTTVNEFILTGITDIAELQAPLFALFLMIYVISVMGNLGMIVLTKLDSRLQTPMY
+FFLRHLAFMDLGYSTTVGPKMLVNFVVDKNIISYYFCATQLAFFLVFIGSELFILSAMSY
+DLYVAICNPLLYTVIMSRRVCQVLVAIPYLYCTFISLLVTIKIFTLSFCGYNVISHFYCD
+SLPLLPLLCSNTHEIELIILIFAAIDLISSLLIVLLSYLLILVAILRMNSAGRQKAFSTC
+GAHLTVVIVFYGTLLFMYVQPKSSHSFDTDKVASIFYTLVIPMLNPLIYSLRNKDVKYAL
+RRTWNNLCNIFV
+>ENSP00000468117.2 pep chromosome:GRCh38:10:116621306:116645143:1 gene:ENSG00000266200.6 transcript:ENST00000591655.3 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PNLIPRP2 description:pancreatic lipase related protein 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:9157]
+MLPPWTLGLLLLATVRGKEVCYGQLGCFSDEKPWAGTLQRPVKLLPWSPEDIDTRFLLYT
+NENPNNFQLITGTEPDTIEASNFQLDRKTRFIIHGFLDKAEDSWPSDMCKKMFEVEKVNC
+ICVDWRHGSRAMYTQAVQNIRVVGAETAFLIQALSTQLGYSLEDVHVIGHSLGAHTAAEA
+GRRLGGRVGRITGLDPAGPCFQDEPEEVRLDPSDAVFVDVIHTDSSPIVPSLGFGMSQKV
+GHLDFFPNGGKEMPGCKKNVLSTITDIDGIWEGIGGFVSCNHLRSFEYYSSSVLNPDGFL
+GYPCASYDEFQESKCFPCPAEGCPKMGHYADQFKGKTSAVEQTFFLNTGESGNFTSWRYK
+ISVTLSGKEKVNGYIRIALYGSNENSKQYEIFKGSLKPDASHTCAIDVDFNVGKIQKVKF
+LWNKRGINLSEPKLGASQITVQSGEDGTEYNFCSSDTVEENVLQSLYPC
+>ENSP00000480815.1 pep chromosome:GRCh38:10:116621306:116645097:1 gene:ENSG00000266200.6 transcript:ENST00000611850.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PNLIPRP2 description:pancreatic lipase related protein 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:9157]
+MLPPWTLGLLLLATVRGKEVCYGQLGCFSDEKPWAGTLQRPVKLLPWSPEDIDTRFLLYT
+NENPNNFQLITGTEPDTIEASNFQLDRKTRFIIHGFLDKAEDSWPSDMCKKMFEVEKVNC
+ICVDWRHGSRAMYTQAVQNIRVVGAETAFLIQALSTQLGYSLEDVHVIGHSLGAHTAAEA
+GRRLGGRVGRITGLDPAGPCFQDEPEEVRLDPSDAVFVDVIHTDSSPIVPSLGFGMSQKV
+GHLDFFPNGGKEMPGCKKNVLSTITDIDGIWEGIGGFVSCNHLRSFEYYSSSVLNPDGFL
+GYPCASYDEFQESKCFPCPAEGCPKMGHYADQFKGKTSAVEQTFFLNTGESGNFTSRYKI
+SVTLSGKEKVNGYIRIALYGSNENSKQYEIFKGSLKPDASHTCAIDVDFNVGKIQKVKFL
+WNKRGINLSEPKLGASQITVQSGEDGTEYNFCSSDTVEENVLQSLYPC
+>ENSP00000463502.4 pep chromosome:GRCh38:10:116620953:116645143:1 gene:ENSG00000266200.6 transcript:ENST00000579578.6 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:PNLIPRP2 description:pancreatic lipase related protein 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:9157]
+MLPPWTLGLLLLATVRGKEVCYGQLGCFSDEKPWAGTLQRPVKLLPWSPEDIDTRFLLYT
+NENPNNFQLITGTEPDTIEASNFQLDRKTRFIIHGFLDKAEDSWPSDMCKKMFEVEKVNC
+ICVDWRHGSRAMYTQAVQNIRVVGAETAFLIQALSTQLGYSLEDVHVIGHSLGAHTAAEA
+GRRLGGRVGRITGLDPAGPCFQDEPEEVRLDPSDAVFVDVIHTDSSPIVPSLGFGMSQKV
+GHLDFFPNGGKEMPGCKKNVLSTITDIDGIWEGIGGFVSCNHLRSFEYYSSSVLNPDGFL
+GYPCASYDEFQESKCFPCPAEGCPKMGHYADQFKGKTSAVEQTFFLNTGESGNFTS*RYK
+ISVTLSGKEKVNGYIRIALYGSNENSKQYEIFKGSLKPDASHTCAIDVDFNVGKIQKVKF
+LWNKRGINLSEPKLGASQITVQSGEDGTEYNFCSSDTVEENVLQSLYPC
+>ENSP00000485881.1 pep chromosome:GRCh38:CHR_HG151_NOVEL_TEST:56830655:56831590:-1 gene:ENSG00000281107.1 transcript:ENST00000631283.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR5G3 description:olfactory receptor family 5 subfamily G member 3 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15287]
+MEDKNQTVVTEFLLLGLTDHPYQKIVLFFMFLFVYLITLGGNLGMITLIWIDPRLHTPMY
+FFLRHLSFVDICSSSSVVPKMLCNIFAEKKDITFLGCAAQMWFFGLFEAAECFLWLPWHM
+TGMWPSASPCCIRSLCLSRSVCSWW*GLMPWLL*AP*LIQFSLFAYPFVVQILSITFSVI
+FFHCFP*HVQTPG*INLCCLSWLEL*EYSVV*SSWSPIFAS**PS*RSRLLMGSKKLSSP
+VFLTLRLSPSCMGLFS*FMFGQVQVPPWVSIK*FLYFILW*SPWLTPLFTA*GIRR*KMH
+SEEKLRGKNLL
+>ENSP00000407375.1 pep chromosome:GRCh38:3:49357176:49358353:-1 gene:ENSG00000233276.5 transcript:ENST00000419783.3 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN
+ELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGA
+GAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYS
+RRFQTIDIEPDIEALLSQGPSCA
+>ENSP00000391316.1 pep chromosome:GRCh38:3:49357178:49358312:-1 gene:ENSG00000233276.5 transcript:ENST00000419349.2 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN
+ELQRRLGPRGLVVLGFPCNQFGHQVRRAERGGAGADVQ
+>ENSP00000495108.1 pep chromosome:GRCh38:3:49357201:49358325:-1 gene:ENSG00000233276.5 transcript:ENST00000643797.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFE
+PNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEK
+FLVGPDGVPLRRYSRRFQTIDIEPDIEALLSQGPSCA
+>ENSP00000495001.1 pep chromosome:GRCh38:3:49357477:49358325:-1 gene:ENSG00000233276.5 transcript:ENST00000646881.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASL*GTTVRDYTQMN
+ELQRRLGPRGLVVLGFPCNQFGHQLPFSPVGERQERRDSEFPQVRPAWWWVRAQLHALRE
+VRGERCGGAPSLRLPAGGPASSQRRRHRAYDRPQAHHLVSGVSQRCCLEL
+>ENSP00000493593.1 pep chromosome:GRCh38:3:49357608:49358325:-1 gene:ENSG00000233276.5 transcript:ENST00000496791.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASL*GTTVRDYTQMN
+ELQRRLGPRGLVVLGFPCNQFGHQLLKFLSALGIAHGGQNPGDS
+>ENSP00000498820.1 pep chromosome:GRCh38:3:49357506:49358278:-1 gene:ENSG00000233276.5 transcript:ENST00000651740.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN
+ELQRRLGPRGLVVLGFPCNQFGHQLPFSPVGERQERRDSEFPQVRPAWWWVRAQLHALRE
+VRGERCGGAPSLRLPAGGPASSQRRRHRAYDRPQAHHLVSGVSQRCCLEL
+>ENSP00000499000.1 pep chromosome:GRCh38:3:49357826:49358278:-1 gene:ENSG00000233276.5 transcript:ENST00000651279.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMN
+ELQRRLGPRGLVVLGFPCNQFGHQLLKFLSALGIAHGGQNPGDS
+>ENSP00000478837.1 pep chromosome:GRCh38:3:49357180:49358358:-1 gene:ENSG00000233276.5 transcript:ENST00000620890.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
+MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLGTTVRDYTQMNE
+LQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAG
+AHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSR
+RFQTIDIEPDIEALLSQGPSCA
+>ENSP00000486888.2 pep chromosome:GRCh38:14:94364313:94366698:-1 gene:ENSG00000258597.3 transcript:ENST00000553483.3 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:SERPINA2 description:serpin family A member 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:8985]
+IPFSVSWGVLLLAGLCCLVPSSLVEDPQEDAAQKTDTSHHDQGDWEDLACQKISYNVTDL
+AFDLYKELADLSQTSNVLVTPTSVAMAFAMLSLGTKADTRTEILEGLNVNLTETPEAKIH
+ECFQQVLQALSRPDTRLQLTTGSSLFVNKSMKLVDTFLEDTKKLYHSEASSINFRDTEEA
+KEQINNYVEKRTGRKVVDLVKHLKKDTSLALVDYISFHGKWKDKFKAERIMVEGFHVDDK
+TIIRVPMINHLGRFDIHRDRELSSWVLAQHYVGNATAFFILPDPKKMWQLEEKLTYSHLE
+NIQRAFDIR*
+>ENSP00000486005.1 pep chromosome:GRCh38:CHR_HSCHR17_2_CTG2:1201029:1203765:1 gene:ENSG00000280938.1 transcript:ENST00000626647.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:TRARG1 description:trafficking regulator of GLUT4 (SLC2A4) 1 [Source:HGNC Symbol;Acc:HGNC:29592]
+XQKK
+>ENSP00000434644.1 pep chromosome:GRCh38:16:81100889:81181324:-1 gene:ENSG00000166473.17 transcript:ENST00000533478.5 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715]
+MGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRG
+EVIRIRATALTRHAYGEDTYVISTVPPREVPACTIAPEEGTVLTSFAIFCNASTALGPLE
+FCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVLTVVISATNRAGDTQQTQAMAKVALG
+DTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAKAVSSMLNQEHESQGSGQSLSIDVRQ
+KVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTCRSKELTPSAQWEASLALQHASEALL
+TVSAKARPEDQRRQAATRDLFQAVGSVLEASLSNRPEEPAEASSSQIATVLRLLRVMEHV
+QTTLLLGKLPGGLPAMLATPSISVYTNRIQPWSWQGSSLRPDAADSATFMLPAASSLSSL
+EGGQEPVDIKIMSFPKSPFPARSHFDVSGTVGGLRVTSPSGQLIPVKNLSENIEILLPRH
+SQRHSQPTVLNLTSPEALWVNVTSGEATLGIQLHWRPDIALTLSLGYGYHPNKSSYDAQT
+HLVPMVAPDELPTWILSPQDLRFGEGVYYLTVVPESDLEPAPGRDLTVGITTFLSHCVFW
+DEVQETWDDSGCQVGPRTSPYQTHCLCNHLTFFGSTFLVMSNAINIHQTAELFATFEDNP
+VVVTTVGCLCVVYVLVVIWARRKDAQDQAKVKVTVLEDNDPFAQYHYLVTVYTGHRRGAA
+TSSKVTVTLYGLDGEREPHHLADPDTPVFERGAVDAFLLSTLFPLGELRSLRLWHDNSGD
+RPSWYVSRVLVYDLVMDRKWYFLCNSWLSINVGDCVLDKVFPVATEQDRKQFSHLFFMKT
+SAGFQDGHIWYSIFSRCARSSFTRVQRVSCCFSLLLCTMLTSIMFWGVPKDPAEQKMDLG
+KIEFTWQEVMIGLESSILMFPINLLIVQIFQNTRPRVAKEQNTGKWDRGSPNLTPSPQPM
+EDGLLTPEAVTKDVSRIVSSLFKALKVPSPALGWDSVNLMDINSLLALVEDVIYPQNTSG
+QVFWEEAKKREDPVTLTLGSSEMKEKSQCPKPKAARSGPWKDSAYRQCLYLQLEHVEQEL
+RLVGPRGFSQPHSHAQALRQLQTLKGGLGVQPGTWAPAHASALQVSKPPQGLPWWCILVG
+WLLVAATSGVAAFFTMLYGLHYGRASSLRWLISMAVSFVESMFVTQPLKVLGFAAFFALV
+LKRVDDEEDTVAPLPGHLLGPDPYALFRARRNSSRDVYQPPLTAAIEKMKTTHLKEQKAF
+ALIREILAYLGFLWMLLLVAYGQRDPSAYHLNRHLQHSFTRGFSGVLGFREFFKWANTTL
+VSNLYGHPPGFITDGNSKLVGSAQIRQVRVQESSCPLAQQPQAYLNGCRAPYSLDAEDMA
+DYGEGWNATTLSNGSTRARTNVKGIPSGANSLCTGEEATWSPWGLIAKARQEFSAISLTT
+PGWTP*PELCLWSPLSTTPTSTCSALSR*R*RPALWAPFLRTRPCRASACTPSPTAGTPS
+W*RQSSSTSSSSSTTWWCRASA*VKRRGAISAASGTFWSWPSSWPAGAPWRCL*RGLSWP
+KGTSSAAGTTGRKASASVRQQQPMPPLATSLPSWYSCPQ*SFGICSG*IPK*T*SRQPYA
+VPGATFQAL*LSSLPCSWLTPSRQT*YLVGNSVPTKPSLMRRRRWSAFSWESSTTRRSWT
+IAQCLAPSSLDPALFL*HLWC*TCLSLSSWWPSVRSKNTISCRRKGRS*ICC**KYSVSW
+ALSLRERSLEAAGSSLGLCPRLATLDQHKLCPRT
+>ENSP00000434417.1 pep chromosome:GRCh38:16:81101123:81220370:-1 gene:ENSG00000166473.17 transcript:ENST00000525539.5 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715]
+MSAVGLVLLVLALRLRATTVKPEEGSFCSNSQVAFRDACYEFVPLGRTFRDAQSWCEGQG
+GHLVFIQDEGTQWFLQKHISQDREWWIGLTWNLARNGTTEGPGTWLDTSNVTYSNWHGGQ
+AAAAPDTCGHIGRGPSSEWVTSDCAQTFAFMCEFRVGQSLACEGLNATVHCGLGQVIQVQ
+DAVYGRQNPHFCTQDAGRPSDLEQGCSWANVKEEVAGQCQELQSCQVAADETYFGNLCPT
+QGSYLWVQYQCREALQLMVSSESFIFDNVTISLTWLLSPYIGNLSCIISTGDSHTFDPYN
+PPSVSSNVTHQFTSPGEFTVFAECTTSEWHVTAQRQVTVRDKMETLSVTACSGLSQSGAG
+PLCQAVFGDPLWIQVELDGGTGVTYTVLLGDITLAESTTQKGSLPYNLILDRETQKLMGP
+GRHRLEIQATGNTTTSTISRNITVHLVELLSGLQASWASDHLELGQDLLITISLAQGTPE
+ELTFEVAGLNATFSHEQVSFGEPFGICRLAVPVEGTFLVTMLVRNAFSNLSLEIGNITIT
+APSGLQEPSGMNAEGKSKDKGDMEVYIQPGPYVDPFTTVTLGWPDNDKELRFQWSCGSCW
+ALWSSCVERQLLRTDQRELVVPASCLPPPDSAVTLRLAVLRGQELENRAEQCLYVSAPWE
+LRPRVSCERNCRPVNASKDILLRVTMGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFW
+PRSLTLLQSNTSTLLLNSSFLQSRGEVIRIRATALTRHAYGEDTYVISTVPPREVPACTI
+APEEGTVLTSFAIFCNASTALGPLEFCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVL
+TVVISATNRAGDTQQTQAMAKVALGDTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAK
+AVSSMLNQEHESQGSGQSLSIDVRQKVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTC
+RSKELTPSAQWEASLALQHASEALLTVSAKARPEDQRRQAATRDLFQAVGSVLEASLSNR
+PEEPAEASSSQIATVLRLLRVMEHVQTTLLLGKLPGGLPAMLATPSISVYTNRIQPWSWQ
+GSSLRPDAADSATFMLPAASSLSSLEGGQEPVDIKIMSFPKSPFPARSHFDVSGTVGGLR
+VTSPSGQLIPVKNLSENIEILLPRHSQRHSQPTVLNLTSPEALWVNVTSGEATLGIQLHW
+RPDIALTLSLGYGYHPNKSSYDAQTHLVPMVAPDELPTWILSPQDLRFGEGVYYLTVVPE
+SDLEPAPGRDLTVGITTFLSHCVFWDEVQETWDDSGCQVGPRTSPYQTHCLCNHLTFFGS
+TFLVMSNAINIHQTAELFATFEDNPVVVTTVGCLCVVYVLVVIWARRKDAQDQAKVKVTV
+LEDNDPFAQYHYLVTVYTGHRRGAATSSKVTVTLYGLDGEREPHHLADPDTPVFERGAVD
+AFLLSTLFPLGELRSLRLWHDNSGDRPSWYVSRVLVYDLVMDRKWYFLCNSWLSINVGDC
+VLDKVFPVATEQDRKQFSHLFFMKTSAGFQDGHIWYSIFSRCARSSFTRVQRVSCCFSLL
+LCTMLTSIMFWGVPKDPAEQKMDLGKIEFTWQEVMIGLESSILMFPINLLIVQIFQNTRP
+RVAKEQNTGKWDRGSPNLTPSPQPMEDGLLTPEAVTKDVSRIVSSLFKALKVPSPALGWD
+SVNLMDINSLLALVEDVIYPQNTSGQVFWEEAKKREDPVTLTLGSSEMKEKSQCPKPKAA
+RSGPWKDSAYRQCLYLQLEHVEQELRLVGPRGFSQPHSHAQALRQLQTLKGGLGVQPGTW
+APAHASALQVSKPPQGLPWWCILVGWLLVAATSGVAAFFTMLYGLHYGRASSLRWLISMA
+VSFVESMFVTQPLKVLGFAAFFALVLKRVDDEEDTVAPLPGHLLGPDPYALFRARRNSSR
+DVYQPPLTAAIEKMKTTHLKEQKAFALIREILAYLGFLWMLLLVAYGQRDPSAYHLNRHL
+QHSFTRGFSGVLGFREFFKWANTTLVSNLYGHPPGFITDGNSKLVGSAQIRQVRVQESSC
+PLAQQPQAYLNGCRAPYSLDAEDMADYGEGWNATTLSNGSTRARTNVKGIPSGANSLCTG
+EEATWSPWGLIAKARQEFSAISLTTPGWTP*PELCLWSPLSTTPTSTCSALSR*R*RPAL
+WAPFLRTRPCRASACTPSPTAGTPSW*RQSSSTSSSSSTTWWCRASA*VKRRGAISAASG
+TFWSWPSSWPAGAPWRCL*RGLSWPKGTSSAAGTTGRKASASVRQQQPMPPLATSLPSWY
+SCPQ*SFGICSG*IPK*T*SRQPYAVPGATFQAL*LSSLPCSWLTPSRQT*YLVGNSVPT
+KPSLMRRRRWSAFSWESSTTRRSWTIAQCLAPSSLDPALFL*HLWC*TCLSLSSWWPSVR
+SKNTISCRRKGRS*ICC**KYSVSWALSLRERSLEAAGSSLGLCPRLATLDQHKLCPRT
+>ENSP00000436309.1 pep chromosome:GRCh38:16:81170289:81181329:-1 gene:ENSG00000166473.17 transcript:ENST00000531391.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715]
+MGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRG
+EVIRIRATALTRHAYGEDTYVISTVPPREVPACTIAPEEGTVLTSFAIFCNASTALGPLE
+FCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVLTVVISATNRAGDTQQTQAMAKVALG
+DTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAKAVSSMLNQEHESQGSGQSLSIDVRQ
+KVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTCRSKELTPSAQGSCMGDSWEGAPPAA
+HVSHAR
+>ENSP00000436389.1 pep chromosome:GRCh38:16:81170292:81198786:-1 gene:ENSG00000166473.17 transcript:ENST00000526632.5 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715]
+XLAQGTPEELTFEVAGLNATFSHEQVSFGEPFGICRLAVPVEGTFLVTMLVRNAFSNLSL
+EIGNITITAPSGLQEPSGMNAEGKSKDKGDMEVYIQPGPYVDPFTTVTLGWPDNDKELRF
+QWSCGSCWALWSSCVERQLLRTDQRELVVPASCLPPPDSAVTLRLAVLRGQELENRAEQC
+LYVSAPWELRPRVSCERNCRPVNASKDILLRVTMGEDSPVAMFSWYLDNTPTEQAEPLLD
+ACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRGEVIRIRATALTRHAYGEDTYVISTVPP
+REVPACTIAPEEGTVLTSFAIFCNASTALGPLEFCFCLESGSCLHCGPEPALPSVYLPLG
+EENNDFVLTVVISATNRAGDTQQTQAMAKVALGDTCVEDVAFQAAVSEKIPTALQGEGGP
+EQLLQLAKAVSSMLNQEHESQGSGQSLSIDVRQKVREHVLGSLSAVTTGLEDVQRVQELA
+EVLREVTCRSKELTPSAQGSCMGDSWEGAPPAAHVSHAR
+>ENSP00000432818.1 pep chromosome:GRCh38:16:81171215:81181327:-1 gene:ENSG00000166473.17 transcript:ENST00000527937.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715]
+MGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFWPRSLTLLQSNTSTLLLNSSFLQSRG
+EVIRIRATGSCLHCGPEPALPSVYLPLGEENNDFVLTVVISATNRAGDTQQTQAMAKVAL
+GDTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAKAVSSMLNQEHESQGSGQSLSIDVR
+QKVPVGSWGAPFIPFLWGPRVCVRPFGLWIKVHGSGEKPVVSPKRLTPPPSLVFWVSDIK
+>ENSP00000337397.4 pep chromosome:GRCh38:16:81170289:81220370:-1 gene:ENSG00000166473.17 transcript:ENST00000337114.8 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:PKD1L2 description:polycystin 1 like 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:21715]
+MSAVGLVLLVLALRLRATTVKPEEGSFCSNSQVAFRDACYEFVPLGRTFRDAQSWCEGQG
+GHLVFIQDEGTQWFLQKHISQDREWWIGLTWNLARNGTTEGPGTWLDTSNVTYSNWHGGQ
+AAAAPDTCGHIGRGPSSEWVTSDCAQTFAFMCEFRVGQSLACEGLNATVHCGLGQVIQVQ
+DAVYGRQNPHFCTQDAGRPSDLEQGCSWANVKEEVAGQCQELQSCQVAADETYFGNLCPT
+QGSYLWVQYQCREALQLMVSSESFIFDNVTISLTWLLSPYIGNLSCIISTGDSHTFDPYN
+PPSVSSNVTHQFTSPGEFTVFAECTTSEWHVTAQRQVTVRDKMETLSVTACSGLSQSGAG
+PLCQAVFGDPLWIQVELDGGTGVTYTVLLGDITLAESTTQKGSLPYNLILDRETQKLMGP
+GRHRLEIQATGNTTTSTISRNITVHLVELLSGLQASWASDHLELGQDLLITISLAQGTPE
+ELTFEVAGLNATFSHEQVSFGEPFGICRLAVPVEGTFLVTMLVRNAFSNLSLEIGNITIT
+APSGLQEPSGMNAEGKSKDKGDMEVYIQPGPYVDPFTTVTLGWPDNDKELRFQWSCGSCW
+ALWSSCVERQLLRTDQRELVVPASCLPPPDSAVTLRLAVLRGQELENRAEQCLYVSAPWE
+LRPRVSCERNCRPVNASKDILLRVTMGEDSPVAMFSWYLDNTPTEQAEPLLDACRLRGFW
+PRSLTLLQSNTSTLLLNSSFLQSRGEVIRIRATALTRHAYGEDTYVISTVPPREVPACTI
+APEEGTVLTSFAIFCNASTALGPLEFCFCLESGSCLHCGPEPALPSVYLPLGEENNDFVL
+TVVISATNRAGDTQQTQAMAKVALGDTCVEDVAFQAAVSEKIPTALQGEGGPEQLLQLAK
+AVSSMLNQEHESQGSGQSLSIDVRQKVREHVLGSLSAVTTGLEDVQRVQELAEVLREVTC
+RSKELTPSAQGSCMGDSWEGAPPAAHVSHAR
+>ENSP00000385765.6 pep chromosome:GRCh38:22:23980058:23983710:1 gene:ENSG00000099984.11 transcript:ENST00000402588.6 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GSTT2 description:glutathione S-transferase theta 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:4642]
+MGLELFLDLVSQPSRAVYIFAKKNGIPLELRTVDLVKGQHKSKEFLQINSLGKLPTLKDG
+DFILTESSAILIYLSCKYQTPDHWYPSDLQARARVHEYLGWHADCIRGTFGIPLWVQMLG
+PLIGVQVPEEKVERNRTAMDQALQWLEDKFLGDRPFLAGQQPVALGYELFEGRPRLAAWR
+G*VEAFLGAELCQEAHSIILSILEQAAKKTLPTPSPEAYQAMLLRIARIP
+>ENSP00000488993.1 pep chromosome:GRCh38:22:23980058:23983915:1 gene:ENSG00000099984.11 transcript:ENST00000634759.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:GSTT2 description:glutathione S-transferase theta 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:4642]
+MGLELFLDLVSQPSRAVYIFAKKNGIPLELRTVDLVKGQHKSKEFLQINSLGKLPTLKDG
+DFILTESSAILIYLSCKYQTPDHWYPSDLQARARVHEYLGWHADCIRGTFGIPLWVQMLG
+PLIGVQVPEEKVERNRTAMDQALQWLEDKFLGDRPFLAGQQVTLADLMALEELMQPVALG
+YELFEGRPRLAAWRG*VEAFLGAELCQEAHSIILSILEQAAKKTLPTPSPEAYQAMLLRI
+ARIP
+>ENSP00000481555.1 pep chromosome:GRCh38:22:23980123:23983911:1 gene:ENSG00000099984.11 transcript:ENST00000621118.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GSTT2 description:glutathione S-transferase theta 2 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:4642]
+MGLELFLDLVSQPSRAVYIFAKKNGIPLELRTVDLVKGQHKSKEFLQINSLGKLPTLKDG
+DFILTESSAILIYLSCKYQTPDHWYPSDLQARARVHEYLGWHADCIRGTFGIPLWVQMLG
+PLIGVQVPEEKVERNRTAMDQALQWLEDKFLGDRPFLAGQQVTLADLMALEELMQPVALG
+YELFEGRPRLAAWRGVEAFLGAELCQEAHSIILSILEQAAKKTLPTPSPEAYQAMLLRIA
+RIP
+>ENSP00000493452.1 pep chromosome:GRCh38:1:247770169:247779524:1 gene:ENSG00000228336.2 transcript:ENST00000446393.2 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR9H1P description:olfactory receptor family 9 subfamily H member 1 pseudogene [Source:HGNC Symbol;Acc:HGNC:15038]
+MVNFTHVSEFVLLGFQGGPGMQAMLFLIFLILYGIAVVGNLGMIVIIWVDAHLHTPMYAF
+LQSLSLLDICYSSTIAPRALANSMQEDHTISFGGCAAQFFFLSLFGITEAFLLAAMAYDR
+FIAICNPLLYSVSMSHQVCVLLISGSYLWGVVNAIAQTTMTFRLPFCGSNEINDFFCDVP
+PLLSLSCSDTFINQLVLLGLCGSIIVSTFLIVLVSYIYIISTILRIPTMQGR*KAFSTCA
+SHLTGVCLFFGTVFFMYAQPSAIFFMEQSKIVSIFYTMVIPMLNPLIYSLRNKEVKQALR
+RSMQKLSL
+>ENSP00000493221.1 pep chromosome:GRCh38:1:248436359:248444316:1 gene:ENSG00000227152.6 transcript:ENST00000641557.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR2T7 description:olfactory receptor family 2 subfamily T member 7 [Source:HGNC Symbol;Acc:HGNC:15019]
+MEQSNYSVYADFILLGLFSNARFPWLLCPHSPGLCDLHSQQRGQDHSHPHRLPPPHPHVL
+PAQPALPQGHPVYFHHCAQNAGRPGDEPESHFLCWMHCPTLPLLDLSRG*VLPPRTHVL*
+SLRSHLQPSALS*PHEPQDLLVDCGGSLAGRVYRWFLAHPRHHAVPLLCLSGDQPLLLRG
+ACPSEALLHGHISLRDSHVCLLYYDAPHPFLCDLGLLHKNSHYCL*DERGRGEAKGCGHL
+LLTHGGCQPLLWGCHVHIRAASFLPHP*AGQSCICLLHHPHSHAQSTHLQP*EQGCHGGP
+TEGCWEVCVLRKGNHFL
+>ENSP00000493243.1 pep chromosome:GRCh38:1:248439145:248444316:1 gene:ENSG00000227152.6 transcript:ENST00000641057.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR2T7 description:olfactory receptor family 2 subfamily T member 7 [Source:HGNC Symbol;Acc:HGNC:15019]
+MEQSNYSVYADFILLGLFSNARFPWLLCPHSPGLCDLHSQQRGQDHSHPHRLPPPHPHVL
+PAQPALPQGHPVYFHHCAQNAGRPGDEPESHFLCWMHCPTLPLLDLSRG*VLPPRTHVL*
+SLRSHLQPSALS*PHEPQDLLVDCGGSLAGRVYRWFLAHPRHHAVPLLCLSGDQPLLLRG
+ACPSEALLHGHISLRDSHVCLLYYDAPHPFLCDLGLLHKNSHYCL*DERGRGEAKGCGHL
+LLTHGGCQPLLWGCHVHIRAASFLPHP*AGQSCICLLHHPHSHAQSTHLQP*EQGCHGGP
+TEGCWEVCVLRKGNHFL
+>ENSP00000475160.1 pep chromosome:GRCh38:3:98064472:98065396:1 gene:ENSG00000213439.3 transcript:ENST00000429239.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR5AC1 description:olfactory receptor family 5 subfamily AC member 1 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:15047]
+MAEENKILVTHFVLTGLTDHPGLQAPLFLVFLVIYLITLVGNLGLMALIWKDPHLHTPIY
+LFLGSLAFADACTSSSVTSKMLINFFIKESYAIHG*VCHPVLLFWFQCNHRMLPAGSDGL
+*PLCSHMQSLALSSGDVQ*PLYSVYRYFIFYWFSAFSDSCGFVI*INFLQVQYYTLFLL*
+NFTAVQNFLHQSYS*YTSDFHLFSIYTSLHFYDSYRLLLLYSLCHPEKEV*EG*KQSLLY
+LQCPSALCLFVLRHPLLHVCEF*VWISCRSGQNVFFILHNNNSFTKSFYLQPKEQRGYRC
+PEKNHEEI
+>ENSP00000476186.1 pep chromosome:GRCh38:3:98306752:98312843:1 gene:ENSG00000232535.2 transcript:ENST00000394191.3 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:OR5H8 description:olfactory receptor family 5 subfamily H member 8 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:14773]
+MDDENATLLTEFVLTGLTYQSEWKIPLFLAFLVIYLITIMANLGLIAVIWKDSHLHIPMY
+LFLGSLAFVDAWLSSSVTPKMLISFLAKSMIISVSECKIQFFSFGISGTTECFLLATMAY
+DRYVAICKPLLYPVIMTNGLCIWLLVLSFIGGFLHALIHEGILFRLTFCNSNIIHHFYCD
+IIPLLKISCTDPSINFLMLFILSGSIQVFTILTVLVSYTFVLFTILKKKVCQRHKESLFH
+LWSPSLICFFILWPPSLHVCAPCISTSR*SRYGGVSILHCHNSFLKSHYLQPEK*ASHRF
+TDKNIKRKCL
+>ENSP00000492977.1 pep chromosome:GRCh38:17:74560701:74567343:-1 gene:ENSG00000284690.2 transcript:ENST00000641710.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:CD300H description:CD300H molecule (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:52292]
+MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQP
+CLPIWHEMVETGGSEGVVRSDQVIITDHPGDLTFTVTLENLTADDAGKYRCGIATILQED
+GLSGFLPDPFFQVQVLVSSASSTENSVKTPASPTRPSQCQGSLPSSTCFLLLPLLKVPLL
+LSILGAILWVNRPWRTPWTES
+>ENSP00000492997.1 pep chromosome:GRCh38:17:74560781:74567343:-1 gene:ENSG00000284690.2 transcript:ENST00000641031.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:CD300H description:CD300H molecule (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:52292]
+MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQP
+CLPIWHEMVETGGSEGVVRSDQVIITDHPGDLTFTVTLENLTADDAGKYRCGIATILQED
+GLSGFLPDPFFQVQVLVSSGPCPAAPASCFSHS
+>ENSP00000498753.1 pep chromosome:GRCh38:17:74560781:74567343:-1 gene:ENSG00000284690.2 transcript:ENST00000651881.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:CD300H description:CD300H molecule (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:52292]
+MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQP
+CLPIWHEMVETGGSEGVVRSDQVIITDHPGDLTFTVTLENLTADDAGKYRCGIATILQED
+GLSGFLPDPFFQVQVLVSSGPCPAAPASCFSHS
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.fa.gz b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.fa.gz
new file mode 100644
index 00000000..db7dd483
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.fa.gz differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-11.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-11.bloomfilter.nodegraph
new file mode 100644
index 00000000..287c4f91
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-11.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-12.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-12.bloomfilter.nodegraph
new file mode 100644
index 00000000..049f2f46
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-12.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-5.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-5.bloomfilter.nodegraph
new file mode 100644
index 00000000..b57b3ae0
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-5.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-6.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-6.bloomfilter.nodegraph
new file mode 100644
index 00000000..c3869dc3
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-6.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-7.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-7.bloomfilter.nodegraph
new file mode 100644
index 00000000..61ab111c
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-dayhoff_ksize-7.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-15.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-15.bloomfilter.nodegraph
new file mode 100644
index 00000000..bf7d5b91
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-15.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-16.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-16.bloomfilter.nodegraph
new file mode 100644
index 00000000..7fe2c05f
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-16.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-20.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-20.bloomfilter.nodegraph
new file mode 100644
index 00000000..aa542aab
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-20.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-21.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-21.bloomfilter.nodegraph
new file mode 100644
index 00000000..c8be607b
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-21.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-22.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-22.bloomfilter.nodegraph
new file mode 100644
index 00000000..534ac1a7
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-22.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-23.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-23.bloomfilter.nodegraph
new file mode 100644
index 00000000..493adf9b
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-23.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-25.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-25.bloomfilter.nodegraph
new file mode 100644
index 00000000..39c67711
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-25.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-5.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-5.bloomfilter.nodegraph
new file mode 100644
index 00000000..70534626
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-5.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-6.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-6.bloomfilter.nodegraph
new file mode 100644
index 00000000..f6cd7951
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-6.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-7.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-7.bloomfilter.nodegraph
new file mode 100644
index 00000000..a8d26947
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-hydrophobic-polar_ksize-7.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-5.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-5.bloomfilter.nodegraph
new file mode 100644
index 00000000..813b953a
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-5.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-6.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-6.bloomfilter.nodegraph
new file mode 100644
index 00000000..b3bbebae
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-6.bloomfilter.nodegraph differ
diff --git a/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph
new file mode 100644
index 00000000..9ae4ab52
Binary files /dev/null and b/tests/data/bloom_filter/Homo_sapiens.GRCh38.pep.subset.molecule-protein_ksize-7.bloomfilter.nodegraph differ
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-11.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-11.csv
new file mode 100644
index 00000000..815573df
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-11.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,12.0,Coding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,13.0,Non-coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.0,12.0,Non-coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.16666666666666666,12.0,Non-coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.125,8.0,Non-coding
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,14.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.07692307692307693,13.0,Non-coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.1,10.0,Non-coding
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.0,12.0,Non-coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,12.0,Non-coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.07692307692307693,13.0,Non-coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.08333333333333333,12.0,Non-coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,12.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,12.0,Non-coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.07142857142857142,14.0,Non-coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,13.0,Non-coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in dayhoff encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-12.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-12.csv
new file mode 100644
index 00000000..d85c2549
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-12.csv
@@ -0,0 +1,23 @@
+,read_id,jaccard_in_peptide_db,n_kmers,classification
+0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,11.0,Coding
+1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.08333333333333333,12.0,Non-coding
+2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.09090909090909091,11.0,Non-coding
+3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.0,11.0,Non-coding
+4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.0,7.0,Non-coding
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,13.0,Coding
+6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.0,12.0,Non-coding
+7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.0,9.0,Non-coding
+8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.09090909090909091,11.0,Non-coding
+9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,11.0,Non-coding
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.08333333333333333,12.0,Non-coding
+11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.09090909090909091,11.0,Non-coding
+13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,11.0,Coding
+14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,11.0,Non-coding
+15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,0.0,Read length was shorter than 3 * preptide k-mer size
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.07692307692307693,13.0,Non-coding
+19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,12.0,Non-coding
+20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide
+21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,6.0,Low complexity peptide
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-6.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-6.csv
new file mode 100644
index 00000000..2c79d464
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-6.csv
@@ -0,0 +1,23 @@
+,read_id,jaccard_in_peptide_db,n_kmers,classification
+0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,16.0,Coding
+1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,18.0,Coding
+2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,17.0,Coding
+3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,17.0,Coding
+4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,13.0,Coding
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,19.0,Coding
+6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,18.0,Coding
+7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,15.0,Coding
+8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,17.0,Coding
+9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.8823529411764706,17.0,Coding
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,18.0,Coding
+11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,16.0,Coding
+13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,17.0,Coding
+14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,17.0,Coding
+15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,6.0,Coding
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,19.0,Coding
+19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,18.0,Coding
+20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide
+21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,6.0,Low complexity peptide
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-7.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-7.csv
new file mode 100644
index 00000000..30ed369d
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-7.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,16.0,Coding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.9411764705882353,17.0,Coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,16.0,Coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,16.0,Coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,12.0,Coding
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,18.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.7647058823529411,17.0,Coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.7857142857142857,14.0,Coding
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,15.0,Coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.25,16.0,Non-coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,17.0,Coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,16.0,Coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,16.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.625,16.0,Coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,5.0,Coding
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.9444444444444444,18.0,Coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.9411764705882353,17.0,Coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in dayhoff encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-8.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-8.csv
new file mode 100644
index 00000000..ea1e3a37
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-dayhoff_ksize-8.csv
@@ -0,0 +1,23 @@
+,read_id,jaccard_in_peptide_db,n_kmers,classification
+0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,15,coding
+1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.8125,16,non-coding
+2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.7333333333333333,15,non-coding
+3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.7857142857142857,14,non-coding
+4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,11,coding
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,17,coding
+6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.5,16,non-coding
+7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.5384615384615384,13,non-coding
+8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.7142857142857143,14,non-coding
+9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.06666666666666667,15,non-coding
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.8125,16,non-coding
+11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,0.0,0,non-coding
+12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.4666666666666667,15,non-coding
+13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,15,coding
+14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.13333333333333333,15,non-coding
+15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,0.0,0,non-coding
+16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,0.0,0,non-coding
+17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,4,coding
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.5882352941176471,17,non-coding
+19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.625,16,non-coding
+20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,-1.0,2,low complexity nucleotide
+21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,-2.0,6,low complexity peptide
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-16.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-16.csv
new file mode 100644
index 00000000..89acc80a
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-16.csv
@@ -0,0 +1,23 @@
+,read_id,jaccard_in_peptide_db,n_kmers,classification
+0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,6.0,Coding
+1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,8.0,Coding
+2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,7.0,Coding
+3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,7.0,Coding
+4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,3.0,Coding
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,9.0,Coding
+6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,8.0,Coding
+7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,5.0,Coding
+8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,7.0,Coding
+9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,1.0,7.0,Coding
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,8.0,Coding
+11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,7.0,Coding
+13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,7.0,Coding
+14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,7.0,Coding
+15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,0.0,Read length was shorter than 3 * preptide k-mer size
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,9.0,Coding
+19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,8.0,Coding
+20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide
+21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,1.0,Low complexity peptide
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-20.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-20.csv
new file mode 100644
index 00000000..634de4f9
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-20.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,,2.0,Low complexity peptide in hydrophobic-polar encoding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,4.0,Coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.6666666666666666,3.0,Non-coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.6666666666666666,3.0,Non-coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,5.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.75,4.0,Non-coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,2.0,Coding
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.6666666666666666,3.0,Non-coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,3.0,Non-coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.75,4.0,Non-coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.6666666666666666,3.0,Non-coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,3.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.6666666666666666,3.0,Non-coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.6,5.0,Non-coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.25,4.0,Non-coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in hydrophobic-polar encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-21.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-21.csv
new file mode 100644
index 00000000..288d511c
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-21.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,2.0,Coding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.6666666666666666,3.0,Non-coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.5,2.0,Non-coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.5,2.0,Non-coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,4.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.6666666666666666,3.0,Non-coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.5,2.0,Non-coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,2.0,Non-coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.3333333333333333,3.0,Non-coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.5,2.0,Non-coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,2.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.5,2.0,Non-coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.5,4.0,Non-coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,3.0,Non-coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in hydrophobic-polar encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-22.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-22.csv
new file mode 100644
index 00000000..b80299c6
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-22.csv
@@ -0,0 +1,23 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,2.0,Non-coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,,0.0,Read length was shorter than 3 * preptide k-mer size
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,3.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.5,2.0,Non-coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,,0.0,Read length was shorter than 3 * preptide k-mer size
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.0,2.0,Non-coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,,,All translations shorter than peptide k-mer size + 1
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,0.0,Read length was shorter than 3 * preptide k-mer size
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,,0.0,Read length was shorter than 3 * preptide k-mer size
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.3333333333333333,3.0,Non-coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,2.0,Non-coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,2.0,Low complexity nucleotide
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,1.0,Low complexity peptide
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-7.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-7.csv
new file mode 100644
index 00000000..b35d3d49
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-7.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,12.0,Coding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,17.0,Coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,16.0,Coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,16.0,Coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,12.0,Coding
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,18.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,17.0,Coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,13.0,Coding
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,16.0,Coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,1.0,15.0,Coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,15.0,Coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,9.0,Coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,16.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,15.0,Coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,5.0,Coding
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,16.0,Coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,16.0,Coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in hydrophobic-polar encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-8.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-8.csv
new file mode 100644
index 00000000..a5e4645c
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-hydrophobic-polar_ksize-8.csv
@@ -0,0 +1,23 @@
+,read_id,jaccard_in_peptide_db,n_kmers,classification
+0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,12,coding
+1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,1.0,16,coding
+2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,1.0,15,coding
+3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,1.0,15,coding
+4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,1.0,11,coding
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,17,coding
+6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,1.0,16,coding
+7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,1.0,13,coding
+8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,1.0,15,coding
+9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,1.0,15,coding
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,1.0,15,coding
+11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,0.0,0,non-coding
+12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,1.0,10,coding
+13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,15,coding
+14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,1.0,15,coding
+15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,0.0,0,non-coding
+16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,0.0,0,non-coding
+17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,1.0,4,coding
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,1.0,16,coding
+19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,1.0,16,coding
+20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,-1.0,2,low complexity nucleotide
+21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,-2.0,7,low complexity peptide
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-6.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-6.csv
new file mode 100644
index 00000000..e7a27a76
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-6.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,17.0,Coding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.05555555555555555,18.0,Non-coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.17647058823529413,17.0,Non-coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.0625,16.0,Non-coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.07692307692307693,13.0,Non-coding
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,19.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.05555555555555555,18.0,Non-coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.125,16.0,Non-coding
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.0625,16.0,Non-coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.0,17.0,Non-coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.16666666666666666,18.0,Non-coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.17647058823529413,17.0,Non-coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,17.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,17.0,Non-coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,0.16666666666666666,6.0,Non-coding
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.15789473684210525,19.0,Non-coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.05555555555555555,18.0,Non-coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in protein encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-7.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-7.csv
new file mode 100644
index 00000000..1f9bdd5e
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-7.csv
@@ -0,0 +1,24 @@
+read_id,jaccard_in_peptide_db,n_kmers,classification
+SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,16.0,Coding
+SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,17.0,Non-coding
+SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.0,16.0,Non-coding
+SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.0,16.0,Non-coding
+SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.0,12.0,Non-coding
+SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,18.0,Coding
+SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.058823529411764705,17.0,Non-coding
+SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.0,14.0,Non-coding
+SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.0625,16.0,Non-coding
+SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.125,16.0,Non-coding
+SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.058823529411764705,17.0,Non-coding
+SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,,,No translation frames without stop codons
+SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.0625,16.0,Non-coding
+SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,16.0,Coding
+SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0625,16.0,Non-coding
+SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,,,No translation frames without stop codons
+SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,,,No translation frames without stop codons
+SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,0.0,5.0,Non-coding
+SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.1111111111111111,18.0,Non-coding
+SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.17647058823529413,17.0,Non-coding
+SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,,,Read length was shorter than 3 * peptide k-mer size
+SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,,,Read length was shorter than 3 * peptide k-mer size
+adversarial_low_complexity_peptide,,1.0,Low complexity peptide in protein encoding
diff --git a/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-8.csv b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-8.csv
new file mode 100644
index 00000000..6279d32a
--- /dev/null
+++ b/tests/data/extract_coding/SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__molecule-protein_ksize-8.csv
@@ -0,0 +1,23 @@
+,read_id,jaccard_in_peptide_db,n_kmers,classification
+0,SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1,1.0,15,coding
+1,SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1,0.0,16,non-coding
+2,SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1,0.0,15,non-coding
+3,SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1,0.07142857142857142,14,non-coding
+4,SRR306838.13334230 Ibis_Run100924_C3PO:6:66:16579:20350/1,0.0,11,non-coding
+5,SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1,1.0,17,coding
+6,SRR306838.6813354 Ibis_Run100924_C3PO:6:32:10591:13073/1,0.0625,16,non-coding
+7,SRR306838.23113368 Ibis_Run100924_C3PO:6:114:13840:18459/1,0.07142857142857142,14,non-coding
+8,SRR306838.10872941 Ibis_Run100924_C3PO:6:53:6164:10522/1,0.07142857142857142,14,non-coding
+9,SRR306838.6192120 Ibis_Run100924_C3PO:6:29:5833:11991/1,0.06666666666666667,15,non-coding
+10,SRR306838.21295280 Ibis_Run100924_C3PO:6:106:2590:13965/1,0.125,16,non-coding
+11,SRR306838.21201208 Ibis_Run100924_C3PO:6:106:2763:5109/1,0.0,0,non-coding
+12,SRR306838.18327923 Ibis_Run100924_C3PO:6:92:9077:13885/1,0.06666666666666667,15,non-coding
+13,SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1,1.0,15,coding
+14,SRR306838.21417895 Ibis_Run100924_C3PO:6:107:8793:5012/1,0.0,15,non-coding
+15,SRR306838.17165743 Ibis_Run100924_C3PO:6:86:18789:18450/1,0.0,0,non-coding
+16,SRR306838.21229494 Ibis_Run100924_C3PO:6:106:6163:7753/1,0.0,0,non-coding
+17,SRR306838.21218773 Ibis_Run100924_C3PO:6:106:16921:6743/1,0.25,4,non-coding
+18,SRR306838.20124664 Ibis_Run100924_C3PO:6:101:4701:5309/1,0.0,17,non-coding
+19,SRR306838.16841308 Ibis_Run100924_C3PO:6:85:6205:5805/1,0.0,16,non-coding
+20,SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1,-1.0,2,low complexity nucleotide
+21,SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1,-2.0,7,low complexity peptide
diff --git a/tests/data/extract_coding/true_protein_coding.fasta b/tests/data/extract_coding/true_protein_coding.fasta
new file mode 100644
index 00000000..473153f2
--- /dev/null
+++ b/tests/data/extract_coding/true_protein_coding.fasta
@@ -0,0 +1,6 @@
+>SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1 translation_frame: -2 jaccard: 1.0
+TEQDLQLYCDFPNIIDVSIKQA
+>SRR306838.2740879 Ibis_Run100924_C3PO:6:13:11155:5248/1 translation_frame: -1 jaccard: 1.0
+QSSSPEFRVQSFSERTNARKKNNH
+>SRR306838.4880582 Ibis_Run100924_C3PO:6:23:17413:5436/1 translation_frame: 2 jaccard: 1.0
+LDPPYSRVITQRETENNQMTSE
diff --git a/tests/data/low_complexity_nucleotides.fastq b/tests/data/low_complexity_nucleotides.fastq
new file mode 100644
index 00000000..1413e5c6
--- /dev/null
+++ b/tests/data/low_complexity_nucleotides.fastq
@@ -0,0 +1,4 @@
+@SRR306838.1531 Ibis_Run100924_C3PO:6:1:15718:1062/1
+ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
++
+(04147:;:9<<:7;88<>=@>>8<;;<=;C;>;:5:;9<<::6@;E;?:C@=:9:67
diff --git a/tests/data/low_complexity_peptides.fastq b/tests/data/low_complexity_peptides.fastq
new file mode 100644
index 00000000..e2e72994
--- /dev/null
+++ b/tests/data/low_complexity_peptides.fastq
@@ -0,0 +1,4 @@
+@SRR306838.2318 Ibis_Run100924_C3PO:6:1:15779:1141/1
+CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCACACACCCCCAACACCC
++
+2263688B;9<<9;=;9<=><:;=:@<@<<;@;S5:;;M true_n_unique_kmers * 0.95
+ assert test.n_unique_kmers() < true_n_unique_kmers * 1.05
+
+
+def test_maybe_make_peptide_bloom_filter(peptide_bloom_filter_path,
+                                         molecule, peptide_ksize):
+    """Smoke test: call with a path and peptides_are_bloom_filter=True."""
+    from khtools.bloom_filter import maybe_make_peptide_bloom_filter
+
+    # No assertion on the return value: this only checks that the call
+    # completes without raising
+    maybe_make_peptide_bloom_filter(
+        peptide_bloom_filter_path, peptide_ksize, molecule,
+        peptides_are_bloom_filter=True)
+
+
+def test_cli_minimum(peptide_fasta):
+    """The bloom filter CLI exits with code 0 given only a peptide fasta."""
+    from khtools.bloom_filter import cli
+
+    # The peptide fasta is the only argument passed
+    cli_args = [peptide_fasta]
+    outcome = CliRunner().invoke(cli, cli_args)
+    assert outcome.exit_code == 0
+
+
+def test_cli_options(peptide_fasta, molecule, peptide_ksize):
+    """The bloom filter CLI accepts ksize, molecule and tablesize options."""
+    from khtools.bloom_filter import cli
+
+    cli_args = [
+        '--peptide-ksize', peptide_ksize, '--molecule', molecule,
+        "--tablesize", "1e4", peptide_fasta,
+    ]
+    outcome = CliRunner().invoke(cli, cli_args)
+    assert outcome.exit_code == 0
+
+
+def test_get_peptide_ksize_default(molecule):
+    """With peptide_ksize=None, each molecule gets its DEFAULT_*_KSIZE."""
+    from khtools.bloom_filter import get_peptide_ksize, \
+        DEFAULT_PROTEIN_KSIZE, DEFAULT_HP_KSIZE, DEFAULT_DAYHOFF_KSIZE
+
+    # Indexed (not .get) so an unlisted molecule errors instead of passing
+    expected = {'protein': DEFAULT_PROTEIN_KSIZE,
+                'dayhoff': DEFAULT_DAYHOFF_KSIZE,
+                'hydrophobic-polar': DEFAULT_HP_KSIZE}
+    test = get_peptide_ksize(molecule, peptide_ksize=None)
+    assert test == expected[molecule]
+
+
+def test_get_peptide_ksize_with_ksize(molecule):
+    """An explicitly provided peptide_ksize is returned unchanged."""
+    from khtools.bloom_filter import get_peptide_ksize
+
+    requested_ksize = 123
+    assert get_peptide_ksize(molecule, requested_ksize) == requested_ksize
+
+
+def test_get_peptide_ksize_with_bad_molecule():
+    """An unrecognized molecule name raises ValueError."""
+    from khtools.bloom_filter import get_peptide_ksize
+
+    with pytest.raises(ValueError):
+        get_peptide_ksize("not a real molecule type", 123)
diff --git a/khtools/tests/test_commandline.py b/tests/test_commandline.py
similarity index 99%
rename from khtools/tests/test_commandline.py
rename to tests/test_commandline.py
index 421a3adb..dc64d88e 100755
--- a/khtools/tests/test_commandline.py
+++ b/tests/test_commandline.py
@@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-
"""
test_commandline
----------------------------------
diff --git a/khtools/tests/test_compare_kmer_content.py b/tests/test_compare_kmer_content.py
similarity index 91%
rename from khtools/tests/test_compare_kmer_content.py
rename to tests/test_compare_kmer_content.py
index 0072f394..3ab5d7d8 100644
--- a/khtools/tests/test_compare_kmer_content.py
+++ b/tests/test_compare_kmer_content.py
@@ -66,9 +66,12 @@ def test_jaccardize(nucleotide_seq1):
def test_kmer_comparison_table(nucleotide_seq1, nucleotide_seq2, ksizes):
from khtools.compare_kmer_content import kmer_comparison_table
- test = kmer_comparison_table('seq1', nucleotide_seq1,
- 'seq2', nucleotide_seq2,
- 'nucleotide', ksizes=ksizes)
+ test = kmer_comparison_table('seq1',
+ nucleotide_seq1,
+ 'seq2',
+ nucleotide_seq2,
+ 'nucleotide',
+ ksizes=ksizes)
s = """id1,id2,ksize,jaccard,molecule
seq1,seq2,2,1.0,nucleotide
seq1,seq2,3,0.8,nucleotide
diff --git a/khtools/tests/test_ensembl.py b/tests/test_ensembl.py
similarity index 100%
rename from khtools/tests/test_ensembl.py
rename to tests/test_ensembl.py
diff --git a/tests/test_extract_coding.py b/tests/test_extract_coding.py
new file mode 100644
index 00000000..c6dcd87b
--- /dev/null
+++ b/tests/test_extract_coding.py
@@ -0,0 +1,329 @@
+import os
+import warnings
+
+from Bio.Seq import Seq
+from click.testing import CliRunner
+import pandas as pd
+import pandas.util.testing as pdt
+import pytest
+import screed
+
+
+@pytest.fixture
+def seq():
+    """Biopython Seq of one nucleotide read, shared by translation tests."""
+    dna = 'CGCTTGCTTAATACTGACATCAATAATATTAGGAAAATCGCAATATAACTGTAAATCCTGTTCTGTC'
+    # All warnings are silenced while the Seq is built, to hide this
+    # PendingDeprecationWarning (Bio.Alphabet is never used explicitly here):
+    # "We intend to remove or replace Bio.Alphabet in 2020, ideally avoid
+    # using it explicitly in your code."
+    # https://github.com/biopython/biopython/issues/2046
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        return Seq(dna)
+
+
+@pytest.fixture
+def low_complexity_seq():  # nucleotide string made up almost only of C's
+    return ("CCCCCCCCCACCACCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCA"
+            "CACACCCCCAACACCC")
+
+
+@pytest.fixture(params=['seq', 'low_complexity_seq'])
+def type_seq(request, seq, low_complexity_seq):
+    """Pair each sequence fixture with its name: (name, sequence)."""
+    by_name = {'seq': seq, 'low_complexity_seq': low_complexity_seq}
+    kind = request.param
+    return (kind, by_name[kind]) if kind in by_name else None
+
+
+def test_three_frame_translation(seq):
+    """Translating seq in three frames yields the three known peptides."""
+    from khtools.extract_coding import three_frame_translation
+
+    expected = ['RLLNTDINNIRKIAI*L*ILFC',
+                'ACLILTSIILGKSQYNCKSCSV',
+                'LA*Y*HQ*Y*ENRNITVNPVL']
+    translations = list(map(str, three_frame_translation(seq)))
+    assert translations == expected
+
+
+def test_compute_fastp_low_complexity(type_seq):
+    """compute_fastp_complexity gives the known score for each sequence."""
+    from khtools.extract_coding import compute_fastp_complexity
+
+    seqtype, seq = type_seq
+    # Keys mirror the two params of the type_seq fixture
+    known_scores = {'seq': 0.746268656716418,
+                    'low_complexity_seq': 0.2631578947368421}
+    assert compute_fastp_complexity(seq) == known_scores[seqtype]
+
+
+def test_evaluate_is_fastp_low_complexity(type_seq):
+    """Only the low-complexity fixture is flagged as low complexity."""
+    from khtools.extract_coding import evaluate_is_fastp_low_complexity
+
+    seqtype, seq = type_seq
+    is_low_complexity = evaluate_is_fastp_low_complexity(seq)
+    if seqtype == 'low_complexity_seq':
+        # low complexity sequence should evaluate to low complexity!
+        assert is_low_complexity
+    elif seqtype == 'seq':
+        # regular sequence is not low complexity
+        assert not is_low_complexity
+
+
+def test_three_frame_translation_no_stops(seq):
+    """Only frame 2 of seq is returned, keyed by its frame number."""
+    from khtools.extract_coding import three_frame_translation_no_stops
+
+    translations = three_frame_translation_no_stops(seq)
+    by_frame = {}
+    for frame, peptide in translations.items():
+        by_frame[frame] = str(peptide)
+    assert by_frame == {2: 'ACLILTSIILGKSQYNCKSCSV'}
+
+
+def test_six_frame_translation_no_stops(seq):
+    """Frames 2, -2 and -3 of seq are returned, keyed by frame number."""
+    from khtools.extract_coding import six_frame_translation_no_stops
+
+    expected = {2: 'ACLILTSIILGKSQYNCKSCSV',
+                -2: 'TEQDLQLYCDFPNIIDVSIKQA',
+                -3: 'QNRIYSYIAIFLILLMSVLSK'}
+    translations = six_frame_translation_no_stops(seq)
+    by_frame = {frame: str(pep) for frame, pep in translations.items()}
+    assert by_frame == expected
+
+
+@pytest.fixture
+def reads(data_folder):
+    """Path of the reads fastq file inside data_folder."""
+    fastq_name = 'SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq'
+    return os.path.join(data_folder, fastq_name)
+
+
+@pytest.fixture
+def true_scores_path(data_folder, molecule, peptide_ksize):
+    """Path of the expected-scores csv for this molecule and k-mer size."""
+    csv_name = ("SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22__"
+                f"molecule-{molecule}_ksize-{peptide_ksize}.csv")
+    scores_folder = os.path.join(data_folder, "extract_coding")
+    return os.path.join(scores_folder, csv_name)
+
+
+@pytest.fixture
+def true_scores(true_scores_path):  # expected scores csv, read by pandas
+    return pd.read_csv(filepath_or_buffer=true_scores_path)
+
+
+@pytest.fixture
+def true_protein_coding_fasta_path(data_folder):  # expected coding peptides
+    fasta_name = "true_protein_coding.fasta"
+    return os.path.join(data_folder, "extract_coding", fasta_name)
+
+
+@pytest.fixture
+def true_protein_coding_fasta_string(true_protein_coding_fasta_path):
+    with open(true_protein_coding_fasta_path) as fasta:  # whole file as str
+        return "".join(fasta)
+
+
+def test_score_reads(capsys, tmpdir, reads, peptide_bloom_filter, molecule,
+                     true_scores, true_scores_path,
+                     true_protein_coding_fasta_path):
+    """score_reads reproduces the expected scores and prints coding reads.
+
+    The returned scores must equal true_scores, every fasta header printed
+    to stdout must be a record name in the expected protein-coding fasta,
+    and every line of that fasta must appear in stdout.
+    """
+    from khtools.extract_coding import score_reads
+
+    scores = score_reads(reads, peptide_bloom_filter, molecule=molecule)
+
+    # Check that scoring was the same
+    pdt.assert_equal(scores, true_scores)
+
+    # --- Check fasta output --- #
+    stdout_lines = capsys.readouterr().out.splitlines()
+    printed_names = [
+        line.lstrip('>') for line in stdout_lines if line.startswith(">")]
+
+    # Check that precision is high -- everything printed was truly coding
+    true_names = get_fasta_record_names(true_protein_coding_fasta_path)
+    assert all(name in true_names for name in printed_names)
+
+    # Check that every line of the expected fasta was printed
+    with open(true_protein_coding_fasta_path) as expected_fasta:
+        for expected_line in expected_fasta:
+            assert expected_line.strip() in stdout_lines
+
+
+def write_fasta_string_to_file(fasta_string, folder, prefix):
+    """Write fasta_string to <folder>/<prefix>.fasta; return that path."""
+    with open(os.path.join(folder, prefix + '.fasta'), 'w') as f:
+        f.write(fasta_string)
+    return f.name
+
+
+def get_fasta_record_names(fasta_path):
+    """Collect the record names that screed reads from fasta_path.
+
+    Returns the names as a set, so duplicates and record order are dropped.
+    """
+    return {record['name'] for record in screed.open(fasta_path)}
+
+
+def test_cli_peptide_fasta(reads, peptide_fasta, molecule, peptide_ksize,
+                           true_protein_coding_fasta_string):
+    """Given a peptide fasta, the CLI output contains the expected fasta."""
+    from khtools.extract_coding import cli
+
+    cli_args = ['--peptide-ksize', peptide_ksize, '--molecule', molecule,
+                peptide_fasta, reads]
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert true_protein_coding_fasta_string in outcome.output
+
+
+def test_cli_bad_jaccard_threshold_float(reads, peptide_fasta):
+    """A --jaccard-threshold of 3.14 exits with code 2 and an error."""
+    from khtools.extract_coding import cli
+
+    cli_args = ["--jaccard-threshold", "3.14", peptide_fasta, reads]
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    expected_error = ('Error: Invalid value for "--jaccard-threshold": '
+                      '--jaccard-threshold needs to be a number between 0 '
+                      'and 1, but 3.14 was provided')
+    assert outcome.exit_code == 2
+    assert expected_error in outcome.output
+
+
+def test_cli_bad_jaccard_threshold_string(reads, peptide_fasta):
+    """A non-numeric --jaccard-threshold exits with code 2 and an error."""
+    from khtools.extract_coding import cli
+
+    cli_args = ["--jaccard-threshold", "beyonce", peptide_fasta, reads]
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    expected_error = ('Error: Invalid value for "--jaccard-threshold": '
+                      'beyonce is not a valid floating point value')
+    assert outcome.exit_code == 2
+    assert expected_error in outcome.output
+
+
+def test_cli_peptide_bloom_filter(reads, peptide_bloom_filter_path, molecule,
+                                  peptide_ksize,
+                                  true_protein_coding_fasta_string):
+    """With --peptides-are-bloom-filter, output has the expected fasta."""
+    from khtools.extract_coding import cli
+
+    cli_args = ['--peptide-ksize', peptide_ksize,
+                "--peptides-are-bloom-filter", '--molecule', molecule,
+                peptide_bloom_filter_path, reads]
+    outcome = CliRunner().invoke(cli, cli_args)
+    assert outcome.exit_code == 0
+    assert true_protein_coding_fasta_string in outcome.output
+
+
+def test_cli_csv(tmpdir, reads, peptide_bloom_filter_path, molecule,
+                 peptide_ksize, true_protein_coding_fasta_string, true_scores):
+    """--csv writes the scores, plus a filename column, to the given path."""
+    from khtools.extract_coding import cli
+
+    csv_path = os.path.join(tmpdir, 'coding_scores.csv')
+
+    cli_args = ['--peptide-ksize', peptide_ksize, "--csv", csv_path,
+                "--peptides-are-bloom-filter", '--molecule', molecule,
+                peptide_bloom_filter_path, reads]
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert true_protein_coding_fasta_string in outcome.output
+    assert os.path.exists(csv_path)
+
+    # the CLI adds the filename to the scoring dataframe
+    expected_scores = true_scores.copy()
+    expected_scores['filename'] = reads
+
+    written_scores = pd.read_csv(csv_path)
+    pdt.assert_equal(written_scores, expected_scores)
+
+
+def test_cli_coding_nucleotide_fasta(tmpdir, reads, peptide_fasta):
+    """--coding-nucleotide-fasta leaves a file at the requested path."""
+    from khtools.extract_coding import cli
+
+    fasta_path = os.path.join(tmpdir, 'coding_nucleotides.fasta')
+    cli_args = ["--coding-nucleotide-fasta", fasta_path,
+                peptide_fasta, reads]
+
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert os.path.exists(fasta_path)
+
+
+def test_cli_noncoding_fasta(tmpdir, reads, peptide_fasta):
+    """--noncoding-nucleotide-fasta leaves a file at the requested path."""
+    from khtools.extract_coding import cli
+
+    fasta_path = os.path.join(tmpdir, 'noncoding_nucleotides.fasta')
+    cli_args = [
+        "--noncoding-nucleotide-fasta", fasta_path,
+        peptide_fasta, reads,
+    ]
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert os.path.exists(fasta_path)
+
+
+def test_cli_low_complexity_nucleotide(tmpdir, reads, peptide_fasta):
+    """--low-complexity-nucleotide-fasta leaves a file at the given path."""
+    from khtools.extract_coding import cli
+
+    fasta_path = os.path.join(tmpdir, 'low_complexity_nucleotide.fasta')
+    cli_args = [
+        "--low-complexity-nucleotide-fasta", fasta_path,
+        peptide_fasta, reads,
+    ]
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert os.path.exists(fasta_path)
+
+
+def test_cli_low_complexity_peptide(tmpdir, reads, peptide_fasta):
+    """--low-complexity-peptide-fasta leaves a file at the given path."""
+    from khtools.extract_coding import cli
+
+    fasta_path = os.path.join(tmpdir, 'low_complexity_peptide.fasta')
+    cli_args = [
+        "--low-complexity-peptide-fasta",
+        fasta_path,
+        peptide_fasta,
+        reads,
+    ]
+
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert os.path.exists(fasta_path)
+
+
+def test_cli_json_summary(tmpdir, reads, peptide_fasta):
+    """--json-summary leaves a file at the requested path."""
+    from khtools.extract_coding import cli
+
+    summary_path = os.path.join(tmpdir, 'coding_summary.json')
+    cli_args = ["--json-summary", summary_path,
+                peptide_fasta, reads]
+
+    outcome = CliRunner().invoke(cli, cli_args)
+
+    assert outcome.exit_code == 0
+    assert os.path.exists(summary_path)
diff --git a/khtools/tests/test_os_utils.py b/tests/test_os_utils.py
similarity index 100%
rename from khtools/tests/test_os_utils.py
rename to tests/test_os_utils.py
diff --git a/khtools/tests/test_sequence_encodings.py b/tests/test_sequence_encodings.py
similarity index 87%
rename from khtools/tests/test_sequence_encodings.py
rename to tests/test_sequence_encodings.py
index 7a3c2059..6a6b8dc8 100644
--- a/khtools/tests/test_sequence_encodings.py
+++ b/tests/test_sequence_encodings.py
@@ -61,6 +61,7 @@ def test_purine_pyrimidize(nucleotide_string):
true = 'RRYYRYR'
assert test == true
+
# -------------------- Test peptide encodings ---------------------------- #
@@ -94,3 +95,16 @@ def test_botvinnikize(peptide_string):
test = botvinnikize(peptide_string)
true = 'dadkacbfghf'
assert test == true
+
+
+def test_encode_peptide(peptide_string, molecule):
+    """encode_peptide re-encodes peptide_string for each molecule type."""
+    from khtools.sequence_encodings import encode_peptide
+
+    expected = {'dayhoff': 'bbbdbfecdac',
+                'hydrophobic-polar': 'phpphhhpppp',
+                'protein': peptide_string}
+    # Fail with a clear message, not an unbound local, for unlisted molecules
+    assert molecule in expected, f"No expected encoding for {molecule!r}"
+    test = encode_peptide(peptide_string, molecule)
+    assert test == expected[molecule]