Python: Analyse Logs and report top N common matches

Tags: #python #logs

Python Analyse Logs and report top N common matches.py

"""
Example:
    python logs.py -i "10 seconds ago"
    python logs.py -i "10 seconds ago" -x "now"
    python logs.py -i "10 seconds ago" -x "now" -u "obiwan"
    python logs.py --min "today at 9.20" --max "today at 10.20" --upstream "obiwan"
""" # noqa

import argparse
import re
import subprocess

from collections import Counter
from datetime import datetime

parser = argparse.ArgumentParser(description="Process Site Router - nginx - Logs")
parser.add_argument("-i", "--min", type=str, required=True, help="When to start getting logs") # noqa
parser.add_argument("-x", "--max", type=str, default="now", help="When to stop getting logs") # noqa
parser.add_argument("-u", "--upstream", type=str, default="bpager", help="Filter logs by this upstream") # noqa
args = parser.parse_args()


def upstreams(logs):
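    """Return a sorted list of the unique upstream names found in the log lines.

    Names are taken from the text between "upstream (" and the following colon.
    """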
    result = set()

    for line in logs:
        match = re.search(r"upstream \(([^:]+)", line)
        if match:
            result.add(match.group(1))

    return sorted(result)


def filter_logs_by_upstream(logs, upstream):
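    """Keep only the log lines whose upstream name matches the given upstream."""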
    log_lines_keep = []

    for line in logs:
        result = re.search(r"upstream \(([^:]+)", line)
        if result and result.group(1) == upstream:
            log_lines_keep.append(line)

    return log_lines_keep


def filter_logs_by_gets_that_were_errors(logs):
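    """Keep only the log lines for GET requests that returned a 4xx/5xx status."""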
    log_lines_gets = []

    for line in logs:
        result = re.search(r"GET /.+ HTTP/1\.1 [45][0-9]{2}", line)
        if result:
            log_lines_gets.append(line)

    return log_lines_gets


def parse_log_urls(logs):
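    """Extract the "GET <path> HTTP/1.1 <status>" fragment from each log line, sorted."""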
    if len(logs) < 1:
        print("no 4xx/5xx errors in the provided log data time period")
        return []

    log_processed = []

    for line in logs:
        result = re.search(r"(GET /.+ HTTP/1\.1 [45][0-9]{2})", line)
        if result:
            log_processed.append(result.group(1))

    return sorted(log_processed)


def timeit(fn):
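    """Decorator that times the wrapped log-retrieval call and prints a summary:
    duration, number of log lines and the upstreams present in those lines."""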
    def helper():
        start = datetime.now()
        print("log retrieval started:", start)

        logs = fn()

        finish = datetime.now()
        print(f"log retrieval finished: {finish}\n")

        difference = finish - start
        time_taken = f"{round(difference.seconds / 60)}mins" if difference.seconds >= 60 else f"{difference.seconds}s" # noqa

        print(f"time taken to retrieve the logs: {time_taken}\n")
        print(f"number of log lines: {len(logs)}")

        print("upstreams found within the logs:\n")
        for upstream in upstreams(logs):
            print(f"\t{upstream}")
        print("\n")

        return logs

    return helper


@timeit
def process():
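    """Fetch raw log lines for the requested time window via the papertrail CLI,
    scoped to the rig-web-public group and the site_router program."""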
    command = [
        "papertrail",
        "-g",
        "rig-web-public",
        # arguments are passed straight to the CLI (no shell), so no embedded quotes needed
        f"--min-time={args.min}",
        f"--max-time={args.max}",
        "program:web-public/site_router/"
    ]

    output = subprocess.run(command, stdout=subprocess.PIPE)
    logs = output.stdout.decode("utf-8").splitlines()

    return logs


# Acquire a set of logs for the specified time period
logs = process()

# Filter logs so we only keep those for a specific upstream
logs_keep = filter_logs_by_upstream(logs, args.upstream)
print(f"logs we're keeping (upstream: {args.upstream}): {len(logs_keep)}")

# Filter logs so we only keep 4xx/5xx GET requests
logs_gets = filter_logs_by_gets_that_were_errors(logs_keep)
print(f"logs after filtering for 4xx/5xx GET requests: {len(logs_gets)}\n")

# Report the top 10 most common 4xx/5xx URLs
result_for_parse_log_urls = parse_log_urls(logs_gets)
count = Counter(result_for_parse_log_urls)
for url, total in count.most_common(10):
    print(f"{url}\ncount: {total}\n")