Kmp

interview_workbook/strings /app/src/interview_workbook/strings/kmp.py
View Source

Algorithm Notes

Summary: Kmp — notes not yet curated.
Time: Estimate via loops/recurrences; common classes: O(1), O(log n), O(n), O(n log n), O(n^2)
Space: Count auxiliary structures and recursion depth.
Tip: See the Big-O Guide for how to derive bounds and compare trade-offs.

Big-O Guide

Source

def build_lps_array(pattern: str) -> list[int]:
    """
    Build the Longest Proper Prefix which is also Suffix (LPS) array.

    Time: O(m) where m = len(pattern)
    Space: O(m)

    The LPS array is the key to KMP's efficiency.
    lps[i] = length of longest proper prefix of pattern[0..i] which is also suffix.

    Interview follow-ups:
    - Why is this O(m) and not O(m²)? (The while loop doesn't reset j to 0)
    - What's a "proper" prefix? (Prefix that's not the entire string)
    - How does this help in pattern matching? (Tells us how far to skip)
    """
    m = len(pattern)
    lps = [0] * m
    length = 0  # Length of previous longest prefix suffix
    i = 1

    while i < m:
        if pattern[i] == pattern[length]:
            length += 1
            lps[i] = length
            i += 1
        else:
            if length != 0:
                # Don't increment i here, we need to check pattern[i] again
                length = lps[length - 1]
            else:
                lps[i] = 0
                i += 1

    return lps


def kmp_search(text: str, pattern: str) -> list[int]:
    """
    Knuth-Morris-Pratt string matching algorithm.

    Time: O(n + m) where n = len(text), m = len(pattern)
    Space: O(m) for LPS array

    Returns: List of starting indices where pattern is found in text

    Key insight: When mismatch occurs, use LPS array to determine
    how many characters we can skip instead of starting over.

    Interview follow-ups:
    - Why is this better than naive? (No backtracking in text)
    - When would you use this vs other algorithms? (Long patterns, multiple searches)
    - What if pattern is longer than text? (Return empty list)
    """
    if not pattern or not text:
        return []

    n, m = len(text), len(pattern)
    if m > n:
        return []

    # Build LPS array
    lps = build_lps_array(pattern)

    matches = []
    i = 0  # Index for text
    j = 0  # Index for pattern

    while i < n:
        if text[i] == pattern[j]:
            i += 1
            j += 1

        if j == m:
            # Found a match
            matches.append(i - j)
            j = lps[j - 1]
        elif i < n and text[i] != pattern[j]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1

    return matches


def kmp_search_first_occurrence(text: str, pattern: str) -> int:
    """
    Find first occurrence of pattern in text using KMP.

    Returns: Index of first match, or -1 if not found
    """
    matches = kmp_search(text, pattern)
    return matches[0] if matches else -1


def naive_string_search(text: str, pattern: str) -> list[int]:
    """
    Naive string matching for comparison.

    Time: O(n * m) worst case
    Space: O(1)

    This is what you'd naturally write, but it's inefficient for large inputs.
    """
    if not pattern or not text:
        return []

    n, m = len(text), len(pattern)
    matches = []

    for i in range(n - m + 1):
        j = 0
        while j < m and text[i + j] == pattern[j]:
            j += 1

        if j == m:
            matches.append(i)

    return matches


def kmp_count_occurrences(text: str, pattern: str) -> int:
    """Count total occurrences of pattern in text."""
    return len(kmp_search(text, pattern))


def kmp_search_case_insensitive(text: str, pattern: str) -> list[int]:
    """Case-insensitive KMP search."""
    return kmp_search(text.lower(), pattern.lower())


def find_period_of_string(s: str) -> int:
    """
    Find the shortest period of a string using KMP's LPS array.

    A string has period p if s[i] = s[i + p] for all valid i.

    Time: O(n)
    Space: O(n)

    Example: "abcabcabc" has period 3
    """
    n = len(s)
    lps = build_lps_array(s)

    # The period is n - lps[n-1]
    period = n - lps[n - 1]

    # Check if this is actually a period (string might not be periodic)
    if n % period == 0:
        return period
    else:
        return n  # String is not periodic, period is the entire length


def is_rotation(s1: str, s2: str) -> bool:
    """
    Check if s2 is a rotation of s1 using KMP.

    Key insight: s2 is a rotation of s1 iff s2 is a substring of s1 + s1

    Time: O(n)
    Space: O(n)

    Example: "waterbottle" is a rotation of "erbottlewat"
    """
    if len(s1) != len(s2):
        return False

    if len(s1) == 0:
        return True

    # Check if s2 is substring of s1 + s1
    doubled = s1 + s1
    return kmp_search_first_occurrence(doubled, s2) != -1


def longest_prefix_suffix(s: str) -> int:
    """
    Find length of longest proper prefix which is also suffix.

    This is just the last element of the LPS array.
    """
    if not s:
        return 0

    lps = build_lps_array(s)
    return lps[-1]


def kmp_multiple_patterns(text: str, patterns: list[str]) -> dict[str, list[int]]:
    """
    Search for multiple patterns in text using KMP.

    For many patterns, consider using Aho-Corasick algorithm instead.
    """
    results = {}
    for pattern in patterns:
        results[pattern] = kmp_search(text, pattern)
    return results


def demo():
    """Demo function for KMP algorithm."""
    print("KMP String Matching Demo")
    print("=" * 40)

    # Basic pattern matching
    text = "ABABDABACDABABCABCABCABCABC"
    pattern = "ABABCABCABCABC"

    print(f"Text: {text}")
    print(f"Pattern: {pattern}")
    print()

    # Show LPS array construction
    lps = build_lps_array(pattern)
    print("LPS array construction:")
    print(f"Pattern: {pattern}")
    print(f"LPS:     {lps}")
    print()

    # Compare KMP vs naive
    kmp_matches = kmp_search(text, pattern)
    naive_matches = naive_string_search(text, pattern)

    print(f"KMP matches: {kmp_matches}")
    print(f"Naive matches: {naive_matches}")
    print(f"Match: {kmp_matches == naive_matches}")
    print()

    # Performance comparison for repeated pattern
    repeated_text = "AAAAAAAAAAAAAAAAAAAAAAAAB"
    repeated_pattern = "AAAAB"

    print("Repeated pattern example:")
    print(f"Text: {repeated_text}")
    print(f"Pattern: {repeated_pattern}")

    kmp_result = kmp_search(repeated_text, repeated_pattern)
    naive_result = naive_string_search(repeated_text, repeated_pattern)

    print(f"KMP result: {kmp_result}")
    print(f"Naive result: {naive_result}")
    print()

    # String period detection
    periodic_strings = ["abcabcabc", "aaaa", "abcdef", "abababab"]

    print("String period detection:")
    for s in periodic_strings:
        period = find_period_of_string(s)
        print(f"'{s}' has period {period}")
    print()

    # Rotation detection
    rotation_pairs = [
        ("waterbottle", "erbottlewat"),
        ("abcde", "cdeab"),
        ("abcde", "abced"),
        ("", ""),
    ]

    print("Rotation detection:")
    for s1, s2 in rotation_pairs:
        is_rot = is_rotation(s1, s2)
        print(f"'{s2}' is rotation of '{s1}': {is_rot}")
    print()

    # Multiple patterns
    multi_text = "she sells seashells by the seashore"
    multi_patterns = ["she", "sea", "shell", "shore"]

    multi_results = kmp_multiple_patterns(multi_text, multi_patterns)
    print(f"Multiple pattern search in: '{multi_text}'")
    for pattern, matches in multi_results.items():
        print(f"  '{pattern}': {matches}")
    print()

    # Edge cases
    print("Edge cases:")
    edge_cases = [("", "abc"), ("abc", ""), ("abc", "abcd"), ("same", "same")]

    for text, pattern in edge_cases:
        result = kmp_search(text, pattern)
        print(f"Text: '{text}', Pattern: '{pattern}' -> {result}")


if __name__ == "__main__":
    demo()