# jdna
#
# Source code for jdna.viewer

"""
Classes to view sequences.

The viewer can display sequences and annotations, as in the following:

.. code::


    > "Unnamed" (550bp)


                                                                ----------------GFP----------------
                                                                |<START
                                                                ----      -----------RFP-----------
    0         CCCAGGACTAGCGACTTTCCGTAACGCGACCTAACACCGGCCGTTCCTTCGAGCCAGGCAAATGTTACGTCACTTCCTTAGATTT
              GGGTCCTGATCGCTGAAAGGCATTGCGCTGGATTGTGGCCGGCAAGGAAGCTCGGTCCGTTTACAATGCAGTGAAGGAATCTAAA

              ------GFP------
              -----------------------------------------RFP-----------------------------------------
    85        TGAACAGCGCCGTACCCCGATATGATATTTAGATATATAGCAGTTACACTTGGGGTTGCTATGGACTTAGATCTGCTGTATGTTT
              ACTTGTCGCGGCATGGGGCTATACTATAAATCTATATATCGTCAATGTGAACCCCAACGATACCTGAATCTAGACGACATACAAA

              -----------------------------------------RFP-----------------------------------------
    170       TCTTACCTTCCGCATCAGGGGACAATTCGCCAGTAGAATTCAGTTTGTGCGTGAGAACATAAGATTGAATCCCACGCAGGCACAA
              AGAATGGAAGGCGTAGTCCCCTGTTAAGCGGTCATCTTAAGTCAAACACGCACTCTTGTATTCTAACTTAGGGTGCGTCCGTGTT

              ---------------------RFP----------------------
    255       GCAGGGCGGGCAGACTCTATAGGTCCTAAGACCCTGAGACTGCGTCCTCAAGATACAGGTTAACAATCCCCGTATGGAGCCGTTC
              CGTCCCGCCCGTCTGAGATATCCAGGATTCTGGGACTCTGACGCAGGAGTTCTATGTCCAATTGTTAGGGGCATACCTCGGCAAG

    340       TTAGCATGACCCGACAGGTGGGCTTGGCTCGCGTAAGTTGAGTGTTGCAGATACCTGCTGCTGCGCGGTCTAGGGGGAATCGCCG
              AATCGTACTGGGCTGTCCACCCGAACCGAGCGCATTCAACTCACAACGTCTATGGACGACGACGCGCCAGATCCCCCTTAGCGGC

    425       ATTTTGACGTAGGATCGGTAATGGGCAGTAAACCCGCAACTATTTTCAGCACCAGATGCAAGTTTCCCTAGAAAGCGTCATGGTT
              TAAAACTGCATCCTAGCCATTACCCGTCATTTGGGCGTTGATAAAAGTCGTGGTCTACGTTCAAAGGGATCTTTCGCAGTACCAA

    510       TGCAATCTCCTTAGGTCACAGCAAACATAGCAGCCCCTGT
              ACGTTAGAGGAATCCAGTGTCGTTTGTATCGTCGGGGACA
"""

import re
import functools
import itertools
from collections import OrderedDict

from networkx import nx

from jdna.utils import random_color, colored, colored_background


[docs]class StringColumn(object): """Class for managing string columns""" FILL = " " def __init__(self, strings=None, color=None, background=None, fill=None): """ StringColumn constructor. :param strings: list of strings :type strings: list """ self._strings = [] self._length = 0 if fill is None: fill = self.FILL self.fill = fill if strings: max_length = max([self.string_length(s) for s in strings]) self._length = max_length for s in strings: self.append_string(self.right_fill(s)) self.color = color self.background = background def apply_color(self, color): self._strings = [colored(s, color) for s in self._strings] return self def apply_background_color(self, color): self._strings = [colored_background(s, color) for s in self._strings] return self @property def length(self): return self._length @staticmethod def remove_formatting(string): pattern = "\\x1b\[\d\dm" return re.sub(pattern, "", string)
[docs] @classmethod def string_length(cls, string): """String length, ignoring terminal formatting""" subbed = cls.remove_formatting(string) l = len(subbed) return len(subbed)
@property def strings(self): return self._strings[:] def indent(self, num): sc_copy = self.copy() for i, s in enumerate(sc_copy.strings): sc_copy._strings[i] = self.fill * num + sc_copy._strings[i] sc_copy._length += num return sc_copy def indent_right(self, num): sc_copy = self.copy() for i, s in enumerate(sc_copy.strings): sc_copy._strings[i] = sc_copy._strings[i] + self.fill * num sc_copy._length += num return sc_copy def center(self, span): diff = span - self.length if diff > 0: l = int(diff / 2) r = l + diff % 2 return self.indent(l).indent_right(r) return self.copy() def flip(self): self._strings = self._strings[::-1] return self.copy() def right_fill(self, string): return string + self.fill * (self.length - self.string_length(string)) def prepend_string(self, new_string): if self.string_length(new_string) > self.length: self._length = self.string_length(new_string) self._strings.insert(0, self.right_fill(new_string)) def append_string(self, new_string): if self.string_length(new_string) > self.length: self._length = self.string_length(new_string) self._strings.append(self.right_fill(new_string)) def add_prefix(self, prefix): for i, s in self.strings: self.strings[i] = prefix + self.strings[i] def stack(self, *others): sc = self.copy() for other in others: for string in other.strings: sc.append_string(string) return sc def __contains__(self, item): return any([item in s for s in self.strings]) def __add__(self, other): if isinstance(other, str): other = StringColumn([other]) else: other = other.copy() sc = self.copy() diff = len(sc.strings) - len(other.strings) if diff > 0: for i in range(diff): other.prepend_string("") elif diff < 0: for i in range(-diff): sc.prepend_string("") new_sc = self.copy_empty() for this_string, other_string in zip(sc.strings, other.strings): new_sc.append_string(this_string + other_string) return new_sc def copy_empty(self): sc_copy = self.copy() sc_copy._strings = [] return sc_copy def copy(self): return self.__copy__() def 
strip_indices(self): n1 = 0 n2 = 0 for x in self[:]: if all([_x == " " for _x in x]): n1 += 1 else: break for x in self[::-1]: if all([_x == " " for _x in x]): n2 += 1 else: break return n1, len(self) - n2 def strip(self): n1, n2 = self.strip_indices() return self[n1:n2] def __copy__(self): copied = self.__class__( self.strings, color=self.color, background=self.background, fill=self.fill ) return copied def __getitem__(self, key): # strings = [self.remove_formatting(s) for s in self.strings] strings = [s.__getitem__(key) for s in self.strings] string_col = self.__class__(strings) return string_col # def __setitem__(self, key, items): # if not len(items) == len(self.strings): # raise TypeError("Value must have {} items".format(len(self.strings))) # for string, item in zip(self.strings, items): # string[key] = item def __eq__(self, other): return str(self) == str(other) def __iter__(self): return zip(*self.strings) def __len__(self): return self.length def __str__(self): s = "\n".join(self.strings) return s def __repr__(self): return str(self)
[docs] @classmethod def condense(cls, rows): """ Condense a list of :class:`StringColumn` into the minimum number of StringColumns comprising of columns stripped of white space. Briefly, this is similar to the following procedure: .. code-block:: input = [ 'label ', ' label2 ', ' label3 ' ] # >> CONDENSE output = [ 'label label2 ', ' label3 ' ] :param rows: :type rows: :return: :rtype: """ segments = [] indexed_segments = [] previous_end = 0 for row in rows: col = ( row.strip() .apply_color(row.color) .apply_background_color(row.background) ) start, end = row.strip_indices() word = (start, end, col) if word not in segments: segments.append(tuple(list(word))) indexed_segments.append((start, end, col, previous_end)) previous_end += 1 # create a graph of non-overlapping segments nonoverlap_graph = nx.Graph() for w in indexed_segments: nonoverlap_graph.add_node(w[-1]) for segment1, segment2 in itertools.combinations(indexed_segments, 2): start1, end1, _, index1 = segment1 start2, end2, _, index2 = segment2 if start1 < start2 or start1 > end2: if start2 < start1 or start2 > end1: if end1 < start2: nonoverlap_graph.add_edge(index1, index2) else: nonoverlap_graph.add_edge(index2, index1) # find minimum number of cliques that covers the graph (clique covering) subgraph = nonoverlap_graph.subgraph(nonoverlap_graph.nodes) cliques = [] while len(subgraph): max_clique = list(nx.find_cliques(subgraph))[0] cliques.append(max_clique) remaining = set(subgraph.nodes).difference(set(max_clique)) subgraph = nonoverlap_graph.subgraph(list(remaining)) condensed_rows = [] for clique in cliques: string_column = cls() clique_segments = [indexed_segments[s] for s in clique] clique_segments = sorted(clique_segments, key=lambda seg: seg[1]) previous_end = 0 for segment in clique_segments: start, end, seg_str_col, _ = segment string_column += seg_str_col.indent(start - previous_end) previous_end = end condensed_rows.append(string_column) return condensed_rows
def chunkify(iterable, n):
    """Break an iterable into chunks (lists) of size at most 'n'.

    :param iterable: items to split
    :type iterable: iterable
    :param n: maximum chunk size, must be at least 1
    :type n: int
    :return: generator of lists; nothing is yielded for an empty iterable
    :rtype: generator
    :raises ValueError: if 'n' is smaller than 1
    """
    if n < 1:
        raise ValueError("Chunk size must be at least 1, not {}".format(n))
    chunk = []
    for x in iterable:
        chunk.append(x)
        if len(chunk) == n:
            yield chunk
            chunk = []
    # Emit the trailing partial chunk, but never an empty (or None) one.
    if chunk:
        yield chunk
def to_lines(string, width):
    """Split a string into consecutive pieces of length <= width.

    :param string: string to split
    :type string: basestring
    :param width: maximum length of each piece
    :type width: int
    :return: the pieces, in order
    :rtype: list
    """
    starts = range(0, len(string), width)
    return [string[begin : begin + width] for begin in starts]
def prepend_lines(lines, label_iterable, indent, fill=" ", align="<"):
    """
    Prepend each line with a label padded to a fixed width.

    Labels and lines are paired in order; pairing stops at the shorter of the
    two iterables.

    :param lines: lines to prepend
    :type lines: list
    :param label_iterable: labels, one per line
    :type label_iterable: iterable
    :param indent: width of the label field, i.e. the number of characters
        between the start of the label and the start of the line
    :type indent: int
    :param fill: character used to pad the label field, default ' '
    :type fill: string
    :param align: either left "<", center "^" or right ">"
    :type align: string
    :return: new prepended lines
    :rtype: list
    """
    # Format spec for the label field: <fill><align><width>.
    label_spec = "{}{}{}".format(fill, align, indent)
    return [
        "{}{}".format(format(label, label_spec), line)
        for label, line in zip(label_iterable, lines)
    ]
def indent(string, indent):
    """Indent every line of a (possibly multi-line) string.

    :param string: text whose lines are separated by newlines
    :type string: basestring
    :param indent: number of spaces to put in front of each line
    :type indent: int
    :return: the indented text
    :rtype: basestring
    """
    original_lines = string.split("\n")
    # An endless supply of empty labels; pairing stops when the lines run out.
    blank_labels = itertools.repeat("")
    return "\n".join(prepend_lines(original_lines, blank_labels, indent))
# def set_indent(lines, indent): # """Reset the indent of lines""" # return indent([l.lstrip() for l in lines], indent) # # # def enumerate_lines(lines, indent): # """Enumerate lines""" # labels = range(len(lines)) # return prepend_lines(lines, labels, indent) # # # def accumulate_length_of_lines(lines, indent): # labels = itertools.accumulate([len(l.strip('\n')) for l in lines], operator.add) # return prepend_lines(lines, labels, indent) # # # def accumulate_length_of_first_line(lines, indent): # labels = itertools.accumulate([len(l.split('\n')[0].strip('\n')) for l in lines], operator.add) # return prepend_lines(lines, labels, indent)
class ViewerAnnotationFlag(object):
    """Fill characters used to indicate the direction of an annotation."""

    # Annotation pointing forward.
    FORWARD = ">"

    # Annotation pointing in reverse.
    REVERSE = "<"

    # No particular direction.
    BOTH = "-"
class SequenceRow(object):
    """A row in a :class:`SequenceViewer` instance. Can be comprised of multiple
    sequences (i.e. lines) and can be annotated with 'features'."""

    def __init__(
        self, lines, labels, indent, start, end, line_colors=None, line_backgrounds=None
    ):
        """
        SequenceRow constructor

        :param lines: list of lines to display. Lengths of all lines must all
            be equivalent.
        :type lines: list
        :param labels: list of labels to apply to each line
        :type labels: list
        :param indent: width of the label field placed before each line;
            annotations are shifted by the same amount
        :type indent: int
        :param start: start bp of this row
        :type start: int
        :param end: end bp of this row
        :type end: int
        :param line_colors: foreground color per line; a single string is
            applied to every line
        :type line_colors: list | basestring
        :param line_backgrounds: background color per line; a single string is
            applied to every line
        :type line_backgrounds: list | basestring
        """
        lengths = set(len(r) for r in lines)
        if len(lengths) > 1:
            raise Exception("Cannot format rows that have different lengths")
        self._lines = lines
        if isinstance(line_colors, str):
            line_colors = [line_colors] * len(lines)
        if isinstance(line_backgrounds, str):
            line_backgrounds = [line_backgrounds] * len(lines)
        self.line_colors = line_colors
        self.line_backgrounds = line_backgrounds
        self.labels = labels
        self.indent = indent
        self.start = start
        self.end = end
        self.annotations = []
        self.bottom_annotations = []

    @property
    def lines(self):
        """The colored lines, each prefixed with its label."""
        lines = self._lines[:]
        if self.line_colors:
            lines = [
                colored(line, color) for line, color in zip(lines, self.line_colors)
            ]
        if self.line_backgrounds:
            lines = [
                colored_background(line, color)
                for line, color in zip(lines, self.line_backgrounds)
            ]
        return prepend_lines(lines, self.labels, self.indent)

    def annotation_lines(self, annotations):
        """Condense the annotations into as few text rows as possible and
        indent them to line up with the sequence lines."""
        condensed = StringColumn.condense(annotations)
        return [str(a.indent(self.indent)) for a in condensed]

    @staticmethod
    def make_annotation(label, span, fill="*", color=None, background=None):
        """
        Make an annotation with 'label' spanning 'span' base pairs.

        A label that fits is centered inside a bar of 'fill' characters; a
        label that is too long is placed on its own row above a bar made only
        of 'fill' characters.

        :param label: annotation label
        :type label: basestring | StringColumn
        :param span: width of the annotation in characters
        :type span: int
        :param fill: what to fill whitespace with
        :type fill: basestring
        :param color: foreground color stored on the returned column
        :type color: basestring
        :param background: background color stored on the returned column
        :type background: basestring
        :return: the annotation
        :rtype: StringColumn
        :raises TypeError: if 'label' is neither a string nor a StringColumn
        """
        if len(fill) != 1:
            raise Exception(
                "Fill '{}' must be a single character long, not {} characters".format(
                    fill, len(fill)
                )
            )
        if fill.strip() == "":
            raise Exception("Fill cannot be whitespace")

        sc = StringColumn(color=color, background=background)
        if isinstance(label, str):
            if len(label) + 1 > span:
                # Label does not fit in the bar: put it on a row of its own.
                sc.append_string(label)
                label = fill * span
            sc2 = StringColumn(fill=fill)
            sc2.append_string(label)
            return sc.stack(sc2.center(span))
        if isinstance(label, StringColumn):
            if len(label) > span:
                sc = sc.stack(label)
                return sc.stack(StringColumn([""], fill=fill).center(span))
            # Work on a copy so the caller's column keeps its own fill.
            label = label.copy()
            label.fill = fill
            return sc.stack(label.center(span))
        raise TypeError(
            "Label must be a string or StringColumn, not {}".format(
                type(label).__name__
            )
        )

    def absolute_annotate(
        self, start, end, fill, label, color=None, background=None, top=True
    ):
        """
        Apply annotation to this row using absolute start and ends for THIS row.

        :param start: inclusive start
        :type start: int
        :param end: inclusive end
        :type end: int
        :param fill: what to fill whitespace with
        :type fill: basestring
        :param label: annotation label
        :type label: basestring
        :param color: foreground color of the annotation
        :type color: basestring
        :param background: background color of the annotation
        :type background: basestring
        :param top: add the annotation above the lines (True) or below (False)
        :type top: bool
        :return: None
        :rtype: None
        """
        span = end - start + 1
        annotation = self.make_annotation(
            label, span, fill, color=color, background=background
        ).indent(start)
        if top:
            self.annotations.append(annotation)
        else:
            self.bottom_annotations.append(annotation)

    def annotate(
        self,
        start,
        end,
        fill,
        label="",
        color=None,
        background=None,
        top=True,
        wrap=False,
    ):
        """
        Annotate the sequence row. If 'start' or 'end' is beyond the expected
        start or end for this row, the annotation will automatically be
        truncated.

        :param start: inclusive start
        :type start: int
        :param end: inclusive end
        :type end: int
        :param fill: what to fill whitespace with
        :type fill: basestring
        :param label: optional label to apply to the annotation
        :type label: basestring
        :param color: foreground color of the annotation
        :type color: basestring
        :param background: background color of the annotation
        :type background: basestring
        :param top: add the annotation above the lines (True) or below (False)
        :type top: bool
        :param wrap: accepted but currently unused
        :type wrap: bool
        :return: None
        :rtype: None
        """
        # Convert to row-relative coordinates and clip to the row.
        s = max(start - self.start, 0)
        e = min(end - self.start, len(self) - 1)
        return self.absolute_annotate(
            s, e, fill, label, color=color, background=background, top=top
        )

    def in_bounds(self, x):
        """
        Checks if the index 'x' is in between row start and end (inclusive)

        :param x: index
        :type x: int
        :return: if in bounds
        :rtype: bool
        """
        return self.start <= x <= self.end

    def __len__(self):
        return len(self._lines[0])

    def __str__(self):
        return "\n".join(
            self.annotation_lines(self.annotations)
            + self.lines
            + self.annotation_lines(self.bottom_annotations)
        )
# # class SequenceLabel(object): # # def __init__(self, indent, label=None, pattern=None, indexer=None): # self.indent = indent # self.index = 0 # self.label = label # if pattern is None: # pattern = "{label} {index}" # self.pattern = pattern # self.indexer = indexer # # def indexers(self): # return { # "line_length": lambda x: self.index + len(x), # "enumerate": lambda x: x + 1 # } # # def enumerate(self, line): # if self.indexer: # self.index += self.indexer(line) # # def __str__(self): # label = self.patter.format(index=self.index, label=self.label) # return "{0:{fill}{align}{indent}".format(label, fill=' ', align='<', indent=self.indent)
class SequenceViewer(object):
    """A class that views long sets of sequences."""

    class DEFAULTS:
        METADATA_INDENT = 2
        INDENT = 10
        SPACER = "\n"
        HEADER_SPACER = "\n"
        WIDTH = 85
        NAME = "Unnamed"
        DESCRIPTION = ""
        BACKGROUND_COLOR = None
        FOREGROUND_COLOR = None
        APPLY_INDICES = [0]

    RANDOM_COLOR = "RANDOM"

    def __init__(
        self,
        sequences,
        sequence_labels=None,
        apply_indices=DEFAULTS.APPLY_INDICES,
        foreground_colors=DEFAULTS.FOREGROUND_COLOR,
        background_colors=DEFAULTS.BACKGROUND_COLOR,
        indent=DEFAULTS.INDENT,
        width=DEFAULTS.WIDTH,
        spacer=DEFAULTS.SPACER,
        header_spacer=DEFAULTS.HEADER_SPACER,
        name=DEFAULTS.NAME,
        window=(0, None),
        description="",
        metadata=None,
    ):
        """
        SequenceViewer constructor

        :param sequences: list of sequences to view
        :type sequences: list
        :param sequence_labels: optional labels to apply to sequence. Include
            the '{index}' to enumerate the base pairs.
        :type sequence_labels: list
        :param apply_indices: indices of the sequences whose label gets an
            '{index} ' prefix
        :type apply_indices: list
        :param foreground_colors: optional list base pair foreground colors
            (hex or common name) to apply to each sequence. If a string is
            provided, color will be applied to all sequences. If provided with
            "RANDOM", a random color will be assigned to each sequence.
        :type foreground_colors: list
        :param background_colors: optional list base pair background colors
            (hex or common name) to apply to each sequence. Usage is analogous
            to `foreground_colors` parameter.
        :type background_colors: list
        :param indent: spacing before start of string and start of base pairs
        :type indent: int
        :param width: width of the view window for the sequences (e.g.
            width=100 would mean rows of at most len 100 characters)
        :type width: int
        :param spacer: string to apply inbetween rows (default is newline)
        :type spacer: string
        :param header_spacer: string inserted between the header and the rows
        :type header_spacer: string
        :param name: optional name for this viewer, to be displayed in the header
        :type name: basestring
        :param window: tuple of the start and end points of the viewing window
        :type window: tuple
        :param description: optional description for this viewer
        :type description: basestring
        :param metadata: optional metadata to display in the header
        :type metadata: dict
        """
        assert isinstance(sequences, list)
        seq_lens = set(len(s) for s in sequences)
        if len(seq_lens) > 1:
            raise Exception(
                "Sequence must be same length but found lengths {}".format(
                    [len(s) for s in sequences]
                )
            )
        self.annotations = []
        self.window = window
        self._sequences = tuple(str(s) for s in sequences)

        if sequence_labels is None:
            sequence_labels = [""] * len(sequences)
        else:
            # Copy so the '{index}' prefixes are not written into the
            # caller's list.
            sequence_labels = list(sequence_labels)
        for i in apply_indices:
            sequence_labels[i] = "{index} " + sequence_labels[i]

        if foreground_colors == self.RANDOM_COLOR:
            foreground_colors = [random_color() for _ in sequences]
        self.foreground_colors = foreground_colors

        if background_colors == self.RANDOM_COLOR:
            background_colors = [random_color() for _ in sequences]
        self.background_colors = background_colors

        self.sequence_labels = sequence_labels
        self.indent = indent
        self.width = width
        self.spacer = spacer
        self.header_spacer = header_spacer
        if name is None:
            name = self.DEFAULTS.NAME
        self.name = name

        self.metadata = OrderedDict()
        if description:
            self.metadata["Description"] = description
        # Inspect the original object: the stored sequences are plain strings
        # and never carry a 'cyclic' attribute.
        if sequences and hasattr(sequences[0], "cyclic"):
            self.metadata["Cyclic"] = sequences[0].cyclic
        if metadata is not None:
            self.metadata.update(metadata)

    def set_window(self, start, end):
        """Sets the viewing window.

        The window is applied as the slice ``[start:end]`` of every sequence,
        so 'end' itself is not included; ``None`` means "to the end".

        :return: this viewer
        :rtype: SequenceViewer
        """
        self.window = (start, end)
        return self

    @property
    def sequences(self):
        """The sequences being viewed, as a list of strings."""
        return list(self._sequences)

    @property
    def header(self):
        """Return the formatted header and metadata"""
        metadata = "\n".join(
            "{key}: {val}".format(key=key, val=val)
            for key, val in self.metadata.items()
        )
        metadata = indent(metadata, self.DEFAULTS.METADATA_INDENT)
        return '> "{name}" ({length}bp)\n{metadata}'.format(
            name=self.name, length=len(self), metadata=metadata
        )

    @property
    def rows(self):
        """Build the annotated :class:`SequenceRow` objects for the current
        window. Rows are rebuilt on every access."""
        window_start, window_end = self.window
        lines = [
            to_lines(str(seq)[window_start:window_end], width=self.width)
            for seq in self.sequences
        ]
        # Interleave the lines so that consecutive items are the same slice of
        # each sequence, then regroup them one row at a time.
        interleafed = itertools.chain.from_iterable(zip(*lines))
        chunks = chunkify(interleafed, len(self.sequences))

        rows = []
        index = window_start or 0
        for chunk in chunks:
            if not chunk:
                # Nothing to display (e.g. empty sequences or empty window).
                continue
            row_length = len(chunk[0])
            labels = [str(l).format(index=index) for l in self.sequence_labels]
            rows.append(
                SequenceRow(
                    chunk,
                    labels,
                    self.indent,
                    index,
                    # Inclusive index of the last base pair actually shown.
                    index + row_length - 1,
                    line_colors=self.foreground_colors,
                    line_backgrounds=self.background_colors,
                )
            )
            index += row_length
        self._annotate_rows(rows)
        return rows

    def annotate(
        self, start, end, label=None, fill=None, color=None, background=None, top=True
    ):
        """
        Annotates this viewer object starting from 'start' to 'end' inclusively.

        :param start: inclusive start
        :type start: int
        :param end: inclusive end
        :type end: int
        :param label: optional label to apply to the annotation
        :type label: basestring | StringColumn
        :param fill: the fill character to use to (e.g. '<', '>', '^') to fill
            in whitespace
        :type fill: string
        :param color: the foreground color to apply to the annotation (hex or
            common name)
        :type color: string
        :param background: the background color to apply to the annotation
            (hex or common name)
        :type background: string
        :param top: place the annotation above the sequences (True) or below
            them (False)
        :type top: bool
        :return: None
        :rtype: None
        """
        if fill is None:
            fill = ViewerAnnotationFlag.BOTH
        if label is None:
            label = ""
        self.annotations.append(
            dict(
                start=start,
                end=end,
                label=label,
                fill=str(fill),
                color=color,
                background=background,
                top=top,
            )
        )

    def _annotate_rows(self, rows):
        """Annotate the rows using the viewer's annotations"""
        for a in self.annotations:
            for row in rows:
                # Apply the annotation to every row it overlaps.
                if a["end"] >= row.start and a["start"] <= row.end:
                    row.annotate(**a)
        return rows

    def print(self):
        """Print the formatted view."""
        print(str(self))

    def __len__(self):
        return len(self.sequences[0])

    def __str__(self):
        spacer = self.spacer
        if spacer is None:
            spacer = ""
        s = "{header}\n".format(header=self.header)
        s += self.header_spacer
        s += "\n{}".format(spacer).join([str(r) for r in self.rows])
        return s
class FASTAItem(SequenceViewer):
    """View of a single sequence with a '>name' header, no index labels and
    80 characters per row."""

    def __init__(self, sequence):
        """
        :param sequence: sequence to display; its ``name`` attribute is used
            in the header
        """
        viewer_options = dict(
            name=sequence.name,
            width=80,
            indent=0,
            sequence_labels=[""],
            apply_indices=[],
            spacer="",
            header_spacer="",
        )
        super().__init__([sequence], **viewer_options)

    @property
    def header(self):
        """The header line: '>' followed by the name."""
        return ">{name}".format(name=self.name)
class FASTAViewer(object):
    """Displays several sequences, each as a :class:`FASTAItem`, separated by
    a blank line."""

    def __init__(self, sequences):
        """
        :param sequences: sequences to display
        :type sequences: iterable
        """
        self.views = []
        for sequence in sequences:
            self.views.append(FASTAItem(sequence))

    def __str__(self):
        rendered = [str(view) for view in self.views]
        return "\n\n".join(rendered)

    def print(self):
        """Print all views."""
        print(str(self))