#!/usr/bin/python3
# -*- coding: utf-8 -*-

# %% re

"""
Most letters and characters will simply match themselves.
For example, the regular expression test will match the string test exactly.

Metacharacters:
. ^ $ * + ? { } [ ] \ | ( )

r'string of mine' is a raw string in Python

match()
Determine if the RE matches at the beginning of the string.

search()
Scan through a string, looking for any location where this RE matches.

findall()
Find all substrings where the RE matches, and return them as a list.

finditer()
Find all substrings where the RE matches, and return them as an iterator.

match() and search() return None if no match can be found,
findall() returns an empty list, and finditer() still returns an iterator.

The returned object:
group()
Return the string matched by the RE

start()
Return the starting position of the match

end()
Return the ending position of the match

span()
Return a tuple containing the (start, end) positions of the match
"""

import re

# Basic search: look for the literal text "Cookie" anywhere in a sentence and
# inspect the match object that search() hands back.
pattern = r"Cookie"
sequence = "Cookies are good"
diditmatch = re.search(pattern, sequence)
# Printed in order: matched text, start index, end index, (start, end) span.
for detail in (
    diditmatch.group(),
    diditmatch.start(),
    diditmatch.end(),
    diditmatch.span(),
):
    print(detail)
# search() returns None when nothing matches, so its result doubles as a
# yes/no test.
print("Match!" if re.search(pattern, sequence) is not None else "Not a match!")

# "." matches any single character except a newline.
dot_match = re.search(r"Co.k.e", "Cookie")
print(dot_match.group())

# findall() returns every non-overlapping match as a list of strings.
greeting_text = "Hello world, hello Python,!"
print(re.findall("[Hh]ello", greeting_text))

# finditer() yields match objects one at a time instead of building a list.
iterThing = re.finditer("[Hh]ello", greeting_text)
print([hit.group() for hit in iterThing])

# match() is like search() but only succeeds at the very beginning of the
# string: the lowercase prefix fails (prints None), the capitalised one
# succeeds (prints the match object).
for prefix in ("hello", "Hello"):
    mo = re.match(prefix, "Hello world, hello Python!")
    print(mo)

# Character classes and anchors. Each entry pairs a pattern with a string it
# matches; the matched text is printed for every pair.
for demo_pattern, demo_text in (
    # \w (lowercase) matches any single letter, digit or underscore.
    (r"Co\wk\we", "Cookie"),
    # \W (uppercase) matches any single character that \w does not match.
    (r"C\Wke", "C@ke"),
    # \s (lowercase) matches a single whitespace character such as
    # space, newline, tab or carriage return.
    (r"Eat\scake", "Eat cake"),
    # \d (lowercase) matches a decimal digit 0-9.
    (r"c\d\dkie", "c00kie"),
    # ^ anchors the pattern at the start of the string.
    (r"^Eat", "Eat cake"),
    # $ anchors the pattern at the end of the string.
    (r"cake$", "Eat cake"),
):
    print(re.search(demo_pattern, demo_text).group())

# Character sets:
#   [abc]         matches a single a, b or c.
#   [a-zA-Z0-9]   matches a single letter (a-z or A-Z) or digit (0-9).
#   [^...]        a leading ^ complements the set: any character that is
#                 NOT listed is matched.
first_abc = re.search(r"[abc]", "Eat cake")
print(first_abc.group())

# \A (uppercase) matches only at the very start of the string. Unlike ^, it
# does not also match after each newline when re.MULTILINE is set.
start_anchor = re.search(r"\A[A-E]ookie", "Cookie")
print(start_anchor.group())

# Quantifiers apply to the item immediately to their left. Each entry pairs
# a pattern with a string it matches; the matched text is printed.
for quantified, subject in (
    # +  one or more repetitions: here one or more "o".
    (r"Co+kie", "Cooookie"),
    # *  zero or more repetitions: here any number of "a" followed by any
    #    number of "o".
    (r"Ca*o*kie", "Cookie"),
    # ?  zero or one repetition: the "u" is optional, so both "Color" and
    #    "Colour" would match.
    (r"Colou?r", "Color"),
    # {x}    exactly x repetitions.
    # {x,}   x or more repetitions.
    # {x,y}  at least x but no more than y repetitions (write it without a
    #        space after the comma; "{x, y}" is not read as a repeat count).
    (r"\d{9,10}", "0987654321"),
):
    print(re.search(quantified, subject).group())

# re.sub(pattern, replacement, string) returns a copy of string in which
# every match of pattern has been replaced by replacement.
print(re.sub("lobster", "thing", "The lobster was tasty"))

# A pattern can be compiled once into a pattern object and then re-used.
# Whether that pays off in speed is discussed here:
# https://stackoverflow.com/questions/452104/is-it-worth-using-pythons-re-compile
p = re.compile("[a-z]+")
m = p.search("::::: word")
# Printed in order: start index, end index, (start, end) span, matched text.
for detail in (m.start(), m.end(), m.span(), m.group()):
    print(detail)


# search() returns None when nothing matches, so check the result before
# calling methods on it.
p = re.compile("[a-z]+")
m = p.search("::::: word")
if m is None:
    print("No match")
else:
    print("Match found: ", m.group())


# Collect every run of decimal digits in a sentence.
# The pattern is a raw string so the backslash in "\d" reaches the regex
# engine untouched; in a normal string literal "\d" is an invalid escape
# sequence that Python warns about.
p = re.compile(r"\d+")
print(p.findall("12 drummers drumming, 11 pipers piping, 10 lords a-leaping"))


# Search for email addresses in "mbox-short.txt" (path is relative to the
# current working directory) and print the matches found on each line.
# The pattern reads: one letter or digit, one or more non-whitespace
# characters, "@", one or more non-whitespace characters, ending in a letter.
# It is a raw string (so "\S" is not treated as a string escape) and is
# compiled once, outside the loop.
email_rgx = re.compile(r"[a-zA-Z0-9]\S+@\S+[a-zA-Z]")
# The with-block closes the file even if an error occurs while reading.
with open("mbox-short.txt") as hand:
    for line in hand:
        x = email_rgx.findall(line.rstrip())
        # findall() returns an empty (falsy) list when the line has no match.
        if x:
            print(x)


# Motif search in a DNA-like sequence.
seq = "ATATAAGATGCGCGCGCTTATGCGCGCA"

# Report every non-overlapping occurrence of "TAT" with its position.
# enumerate(..., 1) numbers the occurrences starting from 1, replacing a
# hand-maintained counter.
rgx = re.compile("TAT")
for i, mo in enumerate(rgx.finditer(seq), 1):
    print("Ocurrence %s: %s" % (i, mo.group()))
    print("Position: From %s to %s" % (mo.start(), mo.end()))

# Find the first run of three or more consecutive "GC" pairs in the same
# sequence and print the whole run.
rgx = re.compile("(GC){3,}")
result = rgx.search(seq)
print(result.group())


# Pattern for locating an open reading frame.
# Compiled case-insensitively and in verbose mode, so the whitespace and line
# breaks inside the pattern text are layout only. Capture groups:
#   1: the last of any leading base triplets skipped before the start
#      (lazy repeat)
#   2: "ATG" followed by as few triplets as possible
#   3: the last triplet matched inside group 2
#   4: the terminating triplet, one of TAA, TGA or TAG
openpat = re.compile(
    """
                     ([TCAG]{3})*?
                     (ATG
                      ([TCAG]{3})*?
                      )
                     (TAA|TGA|TAG)
                     """,
    flags=re.IGNORECASE | re.VERBOSE,
)

# sub() on a compiled pattern: every colour name that matches is replaced
# by the word "colour".
colour_text = "blue socks and red shoes"
p = re.compile("(blue|white|red)")
print(p.sub("colour", colour_text))

# subn() performs the same replacement but returns a 2-tuple:
# (new string, number of replacements performed).
p = re.compile("(blue|white|red)")
replaced_text, replacement_count = p.subn("colour", colour_text)
print((replaced_text, replacement_count))


# Delete GC repeats: strip every run of three or more consecutive "GC"
# pairs. (?:...) groups the pair without capturing it.
seq = "ATGATCGTACTGCGCGCTTCATGTGATGCGCGCGCGCAGACTATAAG"
regex = re.compile("(?:GC){3,}")
print("Before:", seq)
without_repeats = regex.sub("", seq)
print("After:", without_repeats)