Source code for jdna.viewer

"""
Classes to view sequences.

The viewer can display sequences and annotations, as in the following:

.. code::


    > "Unnamed" (550bp)


                                                                ----------------GFP----------------
                                                                |<START
                                                                ----      -----------RFP-----------
    0         CCCAGGACTAGCGACTTTCCGTAACGCGACCTAACACCGGCCGTTCCTTCGAGCCAGGCAAATGTTACGTCACTTCCTTAGATTT
              GGGTCCTGATCGCTGAAAGGCATTGCGCTGGATTGTGGCCGGCAAGGAAGCTCGGTCCGTTTACAATGCAGTGAAGGAATCTAAA

              ------GFP------
              -----------------------------------------RFP-----------------------------------------
    85        TGAACAGCGCCGTACCCCGATATGATATTTAGATATATAGCAGTTACACTTGGGGTTGCTATGGACTTAGATCTGCTGTATGTTT
              ACTTGTCGCGGCATGGGGCTATACTATAAATCTATATATCGTCAATGTGAACCCCAACGATACCTGAATCTAGACGACATACAAA

              -----------------------------------------RFP-----------------------------------------
    170       TCTTACCTTCCGCATCAGGGGACAATTCGCCAGTAGAATTCAGTTTGTGCGTGAGAACATAAGATTGAATCCCACGCAGGCACAA
              AGAATGGAAGGCGTAGTCCCCTGTTAAGCGGTCATCTTAAGTCAAACACGCACTCTTGTATTCTAACTTAGGGTGCGTCCGTGTT

              ---------------------RFP----------------------
    255       GCAGGGCGGGCAGACTCTATAGGTCCTAAGACCCTGAGACTGCGTCCTCAAGATACAGGTTAACAATCCCCGTATGGAGCCGTTC
              CGTCCCGCCCGTCTGAGATATCCAGGATTCTGGGACTCTGACGCAGGAGTTCTATGTCCAATTGTTAGGGGCATACCTCGGCAAG

    340       TTAGCATGACCCGACAGGTGGGCTTGGCTCGCGTAAGTTGAGTGTTGCAGATACCTGCTGCTGCGCGGTCTAGGGGGAATCGCCG
              AATCGTACTGGGCTGTCCACCCGAACCGAGCGCATTCAACTCACAACGTCTATGGACGACGACGCGCCAGATCCCCCTTAGCGGC

    425       ATTTTGACGTAGGATCGGTAATGGGCAGTAAACCCGCAACTATTTTCAGCACCAGATGCAAGTTTCCCTAGAAAGCGTCATGGTT
              TAAAACTGCATCCTAGCCATTACCCGTCATTTGGGCGTTGATAAAAGTCGTGGTCTACGTTCAAAGGGATCTTTCGCAGTACCAA

    510       TGCAATCTCCTTAGGTCACAGCAAACATAGCAGCCCCTGT
              ACGTTAGAGGAATCCAGTGTCGTTTGTATCGTCGGGGACA
"""

import re
import functools
import itertools
from collections import OrderedDict

from networkx import nx

from jdna.utils import random_color, colored, colored_background


[docs]class StringColumn(object):
    """Class for managing string columns"""

    FILL = " "

    def __init__(self, strings=None, color=None, background=None, fill=None):
        """
        StringColumn constructor.

        :param strings: list of strings
        :type strings: list
        """
        self._strings = []
        self._length = 0
        if fill is None:
            fill = self.FILL
        self.fill = fill
        if strings:
            max_length = max([self.string_length(s) for s in strings])
            self._length = max_length
            for s in strings:
                self.append_string(self.right_fill(s))
        self.color = color
        self.background = background

    def apply_color(self, color):
        self._strings = [colored(s, color) for s in self._strings]
        return self

    def apply_background_color(self, color):
        self._strings = [colored_background(s, color) for s in self._strings]
        return self

    @property
    def length(self):
        return self._length

    @staticmethod
    def remove_formatting(string):
        pattern = "\\x1b\[\d\dm"
        return re.sub(pattern, "", string)

[docs]    @classmethod
    def string_length(cls, string):
        """String length, ignoring terminal formatting"""
        subbed = cls.remove_formatting(string)
        l = len(subbed)
        return len(subbed)

    @property
    def strings(self):
        return self._strings[:]

    def indent(self, num):
        sc_copy = self.copy()
        for i, s in enumerate(sc_copy.strings):
            sc_copy._strings[i] = self.fill * num + sc_copy._strings[i]
        sc_copy._length += num
        return sc_copy

    def indent_right(self, num):
        sc_copy = self.copy()
        for i, s in enumerate(sc_copy.strings):
            sc_copy._strings[i] = sc_copy._strings[i] + self.fill * num
        sc_copy._length += num
        return sc_copy

    def center(self, span):
        diff = span - self.length
        if diff > 0:
            l = int(diff / 2)
            r = l + diff % 2
            return self.indent(l).indent_right(r)
        return self.copy()

    def flip(self):
        self._strings = self._strings[::-1]
        return self.copy()

    def right_fill(self, string):
        return string + self.fill * (self.length - self.string_length(string))

    def prepend_string(self, new_string):
        if self.string_length(new_string) > self.length:
            self._length = self.string_length(new_string)
        self._strings.insert(0, self.right_fill(new_string))

    def append_string(self, new_string):
        if self.string_length(new_string) > self.length:
            self._length = self.string_length(new_string)
        self._strings.append(self.right_fill(new_string))

    def add_prefix(self, prefix):
        for i, s in self.strings:
            self.strings[i] = prefix + self.strings[i]

    def stack(self, *others):
        sc = self.copy()
        for other in others:
            for string in other.strings:
                sc.append_string(string)
        return sc

    def __contains__(self, item):
        return any([item in s for s in self.strings])

    def __add__(self, other):
        if isinstance(other, str):
            other = StringColumn([other])
        else:
            other = other.copy()
        sc = self.copy()

        diff = len(sc.strings) - len(other.strings)
        if diff > 0:
            for i in range(diff):
                other.prepend_string("")
        elif diff < 0:
            for i in range(-diff):
                sc.prepend_string("")

        new_sc = self.copy_empty()
        for this_string, other_string in zip(sc.strings, other.strings):
            new_sc.append_string(this_string + other_string)
        return new_sc

    def copy_empty(self):
        sc_copy = self.copy()
        sc_copy._strings = []
        return sc_copy

    def copy(self):
        return self.__copy__()

    def strip_indices(self):
        n1 = 0
        n2 = 0
        for x in self[:]:
            if all([_x == " " for _x in x]):
                n1 += 1
            else:
                break
        for x in self[::-1]:
            if all([_x == " " for _x in x]):
                n2 += 1
            else:
                break
        return n1, len(self) - n2

    def strip(self):
        n1, n2 = self.strip_indices()
        return self[n1:n2]

    def __copy__(self):
        copied = self.__class__(
            self.strings, color=self.color, background=self.background, fill=self.fill
        )
        return copied

    def __getitem__(self, key):
        # strings = [self.remove_formatting(s) for s in self.strings]
        strings = [s.__getitem__(key) for s in self.strings]
        string_col = self.__class__(strings)
        return string_col

    #     def __setitem__(self, key, items):
    #         if not len(items) == len(self.strings):
    #             raise TypeError("Value must have {} items".format(len(self.strings)))
    #         for string, item in zip(self.strings, items):
    #             string[key] = item

    def __eq__(self, other):
        return str(self) == str(other)

    def __iter__(self):
        return zip(*self.strings)

    def __len__(self):
        return self.length

    def __str__(self):
        s = "\n".join(self.strings)
        return s

    def __repr__(self):
        return str(self)

[docs]    @classmethod
    def condense(cls, rows):
        """
        Condense a list of :class:`StringColumn` into the minimum number of StringColumns comprising of columns stripped
        of white space. Briefly, this is similar to the following procedure:

        .. code-block::

            input = [
                'label         ',
                '       label2 ',
                '      label3  '
            ]

            # >> CONDENSE

            output = [
                'label  label2 ',
                '      label3  '
            ]

        :param rows:
        :type rows:
        :return:
        :rtype:
        """
        segments = []
        indexed_segments = []
        previous_end = 0
        for row in rows:
            col = (
                row.strip()
                .apply_color(row.color)
                .apply_background_color(row.background)
            )
            start, end = row.strip_indices()
            word = (start, end, col)
            if word not in segments:
                segments.append(tuple(list(word)))
                indexed_segments.append((start, end, col, previous_end))
                previous_end += 1

        # create a graph of non-overlapping segments
        nonoverlap_graph = nx.Graph()
        for w in indexed_segments:
            nonoverlap_graph.add_node(w[-1])
        for segment1, segment2 in itertools.combinations(indexed_segments, 2):
            start1, end1, _, index1 = segment1
            start2, end2, _, index2 = segment2
            if start1 < start2 or start1 > end2:
                if start2 < start1 or start2 > end1:
                    if end1 < start2:
                        nonoverlap_graph.add_edge(index1, index2)
                    else:
                        nonoverlap_graph.add_edge(index2, index1)

        # find minimum number of cliques that covers the graph (clique covering)
        subgraph = nonoverlap_graph.subgraph(nonoverlap_graph.nodes)
        cliques = []
        while len(subgraph):
            max_clique = list(nx.find_cliques(subgraph))[0]
            cliques.append(max_clique)
            remaining = set(subgraph.nodes).difference(set(max_clique))
            subgraph = nonoverlap_graph.subgraph(list(remaining))

        condensed_rows = []
        for clique in cliques:
            string_column = cls()
            clique_segments = [indexed_segments[s] for s in clique]
            clique_segments = sorted(clique_segments, key=lambda seg: seg[1])

            previous_end = 0
            for segment in clique_segments:
                start, end, seg_str_col, _ = segment
                string_column += seg_str_col.indent(start - previous_end)
                previous_end = end
            condensed_rows.append(string_column)
        return condensed_rows


[docs]def chunkify(iterable, n):
    """Break an interable into chunks of size at most 'n'"""
    chunk = None
    for i, x in enumerate(iterable):
        if i % n == 0:
            if chunk is not None:
                yield chunk
            chunk = []
        chunk.append(x)
    yield chunk


[docs]def to_lines(string, width):
    """Converts a string to lines of length <= width"""
    lines = []
    for i in range(0, len(string), width):
        lines.append(string[i : i + width])
    return lines


[docs]def prepend_lines(lines, label_iterable, indent, fill=" ", align="<"):
    """
    Prepend lines with a label

    :param lines: lines to prepend
    :type lines: list
    :param indent: number of spaces between start of label and start of line
    :type indent: int
    :param fill: default ' '
    :type fill: what to fill the spaces
    :param align: either left "<", center "^" or right ">"
    :type align: string
    :return: new prepended lines
    :rtype: list
    """
    prepend_pattern = functools.partial(
        "{0:{fill}{align}{indent}}".format, fill=fill, align=align, indent=indent
    )
    new_lines = []
    for label, line in zip(label_iterable, lines):
        new_lines.append("{}{}".format(prepend_pattern(label), line))
    return new_lines


[docs]def indent(string, indent):
    """Indent lines"""
    lines = string.split("\n")
    new_lines = prepend_lines(lines, [""] * len(lines), indent)
    return "\n".join(new_lines)


# def set_indent(lines, indent):
#     """Reset the indent of lines"""
#     return indent([l.lstrip() for l in lines], indent)
#
#
# def enumerate_lines(lines, indent):
#     """Enumerate lines"""
#     labels = range(len(lines))
#     return prepend_lines(lines, labels, indent)
#
#
# def accumulate_length_of_lines(lines, indent):
#     labels = itertools.accumulate([len(l.strip('\n')) for l in lines], operator.add)
#     return prepend_lines(lines, labels, indent)
#
#
# def accumulate_length_of_first_line(lines, indent):
#     labels = itertools.accumulate([len(l.split('\n')[0].strip('\n')) for l in lines], operator.add)
#     return prepend_lines(lines, labels, indent)


[docs]class ViewerAnnotationFlag(object):
    """Flags for annotation directions"""

    FORWARD = ">"
    REVERSE = "<"
    BOTH = "-"


[docs]class SequenceRow(object):
    """A row in a :class:`SequenceViewer` instance. Can be comprised of multiple sequences (i.e. lines)
    and can be annotated with 'features'."""

    def __init__(
        self, lines, labels, indent, start, end, line_colors=None, line_backgrounds=None
    ):
        """
        SequenceRow constructor

        :param lines: list of lines to display. Lengths of all lines must all be equivalent.
        :type lines: list
        :param labels: list of labels to apply to each line
        :type labels: list
        :param indent: indent to apply to the lines
        :type indent: string
        :param start: start bp of this row
        :type start: int
        :param end: end bp of this row
        :type end: int
        """
        lengths = set([len(r) for r in lines])
        if len(lengths) > 1:
            raise Exception("Cannot format rows that have different lengths")
        self._lines = lines
        if isinstance(line_colors, str):
            line_colors = [line_colors] * len(lines)
        if isinstance(line_backgrounds, str):
            line_backgrounds = [line_backgrounds] * len(lines)
        self.line_colors = line_colors
        self.line_backgrounds = line_backgrounds
        self.labels = labels
        self.indent = indent
        self.start = start
        self.end = end
        self.annotations = []
        self.bottom_annotations = []

    @property
    def lines(self):
        lines = self._lines[:]
        if self.line_colors:
            lines = [
                colored(line, color) for line, color in zip(lines, self.line_colors)
            ]
        if self.line_backgrounds:
            lines = [
                colored_background(line, color)
                for line, color in zip(lines, self.line_backgrounds)
            ]
        return prepend_lines(lines, self.labels, self.indent)

    def annotation_lines(self, annotations):
        condensed = StringColumn.condense(annotations)
        return [str(a.indent(self.indent)) for a in condensed]

[docs]    @staticmethod
    def make_annotation(label, span, fill="*", color=None, background=None):
        """
        Make an annotation with 'label' spanning inclusive base pairs indices 'span'

        :param label: annotation label
        :type label: basestring
        :param span: the start and end (inclusive) of the annotation
        :type span: tuple
        :param fill: what to fill whitespace with
        :type fill: basestring
        :return:
        :rtype:
        """

        if len(fill) != 1:
            raise Exception(
                "Fill '{}' must be a single character long, not {} characters".format(
                    fill, len(fill)
                )
            )
        if fill.strip() == "":
            raise Exception("Fill cannot be whitespace")
        sc = StringColumn(color=color, background=background)
        if isinstance(label, str):
            if len(label) + 1 > span:
                sc.append_string(label)
                # sc.append_string("|<{0:{fill}{align}{indent}}".format(label, fill=' ', align='^', indent=span))
                label = fill * span
            sc2 = StringColumn(fill=fill)
            sc2.append_string(label)
            return sc.stack(sc2.center(span))
            # sc2.append_string("{0:{fill}{align}{indent}}".format(label, fill=fill, align='^', indent=span))
        elif isinstance(label, StringColumn):
            if len(label) > span:
                sc = sc.stack(label)
                return sc.stack(StringColumn([""], fill=fill).center(span))
            label.fill = fill
            return sc.stack(label.center(span))

[docs]    def absolute_annotate(
        self, start, end, fill, label, color=None, background=None, top=True
    ):
        """
        Applyt annotation to this row using absolute start and ends for
        THIS row.

        :param start: inclusive start
        :type start: int
        :param end: inclusive end
        :type end: int
        :param fill: what to fill whitespace with
        :type fill: basestring
        :param label: annotation label
        :type label: basestring
        :return: None
        :rtype: None
        """
        span = end - start + 1
        annotation = self.make_annotation(
            label, span, fill, color=color, background=background
        ).indent(start)
        if top:
            self.annotations.append(annotation)
        else:
            self.bottom_annotations.append(annotation)

[docs]    def annotate(
        self,
        start,
        end,
        fill,
        label="",
        color=None,
        background=None,
        top=True,
        wrap=False,
    ):
        """
        Annotate the sequence row. If 'start' or 'end' is beyond,
        the expected start or end for this row, the annotation will
        automatically be truncated.

        :param start: inclusive start
        :type start: int
        :param end: inclusive end
        :type end: int
        :param fill: what to fill whitespace with
        :type fill:
        :param label: optional label to apply to the annotation
        :type label: basestring
        :return:
        :rtype:
        """
        s = max(start - self.start, 0)
        e = min(end - self.start, len(self) - 1)
        return self.absolute_annotate(
            s, e, fill, label, color=color, background=background, top=top
        )

[docs]    def in_bounds(self, x):
        """
        Checks if the index 'x' is in between row start and end (inclusive)

        :param x: index
        :type x: int
        :return: if in bounds
        :rtype: bool
        """
        return x >= self.start and x <= self.end

    def __len__(self):
        return len(self._lines[0])

    def __str__(self):
        return "\n".join(
            self.annotation_lines(self.annotations)
            + self.lines
            + self.annotation_lines(self.bottom_annotations)
        )


#
# class SequenceLabel(object):
#
#     def __init__(self, indent, label=None, pattern=None, indexer=None):
#         self.indent = indent
#         self.index = 0
#         self.label = label
#         if pattern is None:
#             pattern = "{label} {index}"
#         self.pattern = pattern
#         self.indexer = indexer
#
#     def indexers(self):
#         return {
#             "line_length": lambda x: self.index + len(x),
#             "enumerate": lambda x: x + 1
#         }
#
#     def enumerate(self, line):
#         if self.indexer:
#             self.index += self.indexer(line)
#
#     def __str__(self):
#         label = self.patter.format(index=self.index, label=self.label)
#         return "{0:{fill}{align}{indent}".format(label, fill=' ', align='<', indent=self.indent)


[docs]class SequenceViewer(object):
    """A class that views longs sets of sequences."""

    class DEFAULTS:
        METADATA_INDENT = 2
        INDENT = 10
        SPACER = "\n"
        HEADER_SPACER = "\n"
        WIDTH = 85
        NAME = "Unnamed"
        DESCRIPTION = ""
        BACKGROUND_COLOR = None
        FOREGROUND_COLOR = None
        APPLY_INDICES = [0]

    RANDOM_COLOR = "RANDOM"

    def __init__(
        self,
        sequences,
        sequence_labels=None,
        apply_indices=DEFAULTS.APPLY_INDICES,
        foreground_colors=DEFAULTS.FOREGROUND_COLOR,
        background_colors=DEFAULTS.BACKGROUND_COLOR,
        indent=DEFAULTS.INDENT,
        width=DEFAULTS.WIDTH,
        spacer=DEFAULTS.SPACER,
        header_spacer=DEFAULTS.HEADER_SPACER,
        name=DEFAULTS.NAME,
        window=(0, None),
        description="",
        metadata=None,
    ):
        """
        SequenceViewer constructor

        :param sequences: list of sequences to view
        :type sequences: list
        :param sequence_labels: optional labels to apply to sequence. Include the '{index}' to enumerate the base pairs.
        :type sequence_labels: list
        :param foreground_colors: optional list base pair foreground colors (hex or common name) to apply to each sequence. If a string
                                    is provided, color will be applied to all sequences. If provided with "RANDOM",
                                    a random color will be assigned to each sequence.
        :type foreground_colors: list
        :param background_colors: optional list base pair background colors (hex or common name) to apply to each sequence.
                                    Usage is analogous to `foreground_colors` parameter.
        :type background_colors: list
        :param indent: spacing before start of string and start of base pairs
        :type indent: int
        :param width: width of the view window for the sequences (e.g. width=100 would mean rows of at most len 100
                        characters
        :type width: string
        :param spacer: string to apply inbetween rows (default is newline)
        :type spacer: string
        :param name: optional name for this viewer, to be displayed in the header
        :type name: basestring
        :param window: tuple of the start and end points of the viewing window
        :type window: tuple
        :param description: optional description for this viewer
        :type description: basestring
        :param metadata: optional metadata to display in the header
        :type metadata: dict
        """
        assert isinstance(sequences, list)
        seq_lens = set([len(s) for s in sequences])
        if len(seq_lens) > 1:
            raise Exception(
                "Sequence must be same length but found lengths {}".format(
                    [len(s) for s in sequences]
                )
            )

        self.annotations = []

        self.window = window
        self._sequences = tuple([str(s) for s in sequences])
        if sequence_labels is None:
            sequence_labels = [""] * len(sequences)
        for i in apply_indices:
            sequence_labels[i] = "{index} " + sequence_labels[i]

        if foreground_colors == self.RANDOM_COLOR:
            foreground_colors = [random_color() for _ in sequences]
        self.foreground_colors = foreground_colors
        if background_colors == self.RANDOM_COLOR:
            background_colors = [random_color() for _ in sequences]
        self.background_colors = background_colors

        self.sequence_labels = sequence_labels
        self.indent = indent
        self.width = width
        self.spacer = spacer
        self.header_spacer = header_spacer
        if name is None:
            name = self.DEFAULTS.NAME
        self.name = name
        self.metadata = OrderedDict()
        if description:
            self.metadata["Description"] = self.DEFAULTS.DESCRIPTION
        if hasattr(self.sequences[0], "cyclic"):
            self.metadata["Cyclic"] = self.sequences[0].cyclic
        if metadata is not None:
            self.metadata.update(metadata)

[docs]    def set_window(self, start, end):
        """Sets the inclusive viewing window"""
        self.window = (start, end)
        return self

    @property
    def sequences(self):
        return list(self._sequences)

    @property
    def header(self):
        """Return the formatted header and metadata"""
        metadata = "\n".join(
            "{key}: {val}".format(key=key, val=val)
            for key, val in self.metadata.items()
        )
        metadata = indent(metadata, self.DEFAULTS.METADATA_INDENT)
        return '> "{name}" ({length}bp)\n{metadata}'.format(
            name=self.name, length=len(self), metadata=metadata
        )

    @property
    def rows(self):
        lines = []
        for seq in self.sequences:
            line = to_lines(str(seq)[self.window[0] : self.window[1]], width=self.width)
            lines.append(line)
        interleafed = functools.reduce(lambda x, y: x + y, zip(*lines))
        chunks = chunkify(interleafed, len(self.sequences))
        rows = []
        index = self.window[0]
        for chunk in chunks:
            labels = [str(l).format(index=index) for l in self.sequence_labels]
            rows.append(
                SequenceRow(
                    chunk,
                    labels,
                    self.indent,
                    index,
                    min(index + self.width - 1, len(self)),
                    line_colors=self.foreground_colors,
                    line_backgrounds=self.background_colors,
                )
            )
            index += len(chunk[0])
        self._annotate_rows(rows)
        return rows

[docs]    def annotate(
        self, start, end, label=None, fill=None, color=None, background=None, top=True
    ):
        """
        Annotates this viewer object starting from 'start' to 'end' inclusively.

        :param start: inclusive start
        :type start: int
        :param end: inclusive end
        :type end: int
        :param label: optional label to apply to the annotation
        :type label: basestring | StringColumn
        :param fill: the fill character to use to (e.g. '<', '>', '^') to fill in whitespace
        :type fill: string
        :param color: the foreground color to apply to the annotation (hex or common name)
        :type color: string
        :param background: the foreground color to apply to the annotation (hex or common name)
        :type background: string
        :return: None
        :rtype: None
        """
        if fill is None:
            fill = ViewerAnnotationFlag.BOTH
        if label is None:
            label = ""
        self.annotations.append(
            dict(
                start=start,
                end=end,
                label=label,
                fill=str(fill),
                color=color,
                background=background,
                top=top,
            )
        )

[docs]    def _annotate_rows(self, rows):
        """Annotate the rows using the viewer's annotations"""
        for a in self.annotations:
            for row in rows:
                if a["end"] >= row.start and a["start"] <= row.end:
                    row.annotate(**a)
        return rows

    def print(self):
        print(str(self))

    def __len__(self):
        return len(self.sequences[0])

    def __str__(self):
        spacer = self.spacer
        if spacer is None:
            spacer = ""
        s = "{header}\n".format(header=self.header)
        s += self.header_spacer
        s += "\n{}".format(spacer).join([str(r) for r in self.rows])
        return s


[docs]class FASTAItem(SequenceViewer):
    def __init__(self, sequence):
        super().__init__(
            [sequence],
            indent=0,
            width=80,
            name=sequence.name,
            apply_indices=[],
            sequence_labels=[""],
            spacer="",
            header_spacer="",
        )

    @property
    def header(self):
        return ">{}".format(self.name)


class FASTAViewer(object):
    def __init__(self, sequences):
        self.views = [FASTAItem(sequence) for sequence in sequences]

    def __str__(self):
        return "\n\n".join(str(v) for v in self.views)

    def print(self):
        print(str(self))
jdna

Table Of Contents

Source code for jdna.viewer