#!/usr/bin/env python3

import gzip

# Gzipped, one-identifier-per-line input files to compare.
PREV_FILENAME = 'ids-uniprotkb-2025sep1.gz'
CURRENT_FILENAME = 'ids-uniprotkb-current.gz'


def _load_identifiers(filename):
    """Return the set of unique identifiers listed in a gzipped text file.

    Each line is stripped of surrounding whitespace and treated as one
    identifier. Blank lines are counted as lines but are NOT stored, so an
    empty string never shows up as a bogus identifier in the results.

    Prints a one-line summary (unique identifiers vs. total lines read).

    Raises:
        OSError: if the file cannot be opened or is not valid gzip data.
    """
    identifiers = set()
    line_count = 0
    with gzip.open(filename, 'rt', encoding='utf-8') as fin:
        for line in fin:
            line_count += 1
            identifier = line.strip()
            if identifier:
                identifiers.add(identifier)
    print(f"Loaded {filename}: {len(identifiers)} found on {line_count} lines.")
    return identifiers


def _write_identifiers(filename, identifiers):
    """Write identifiers, sorted, one per line, to a gzipped text file.

    Every line (including the last) is newline-terminated so that line
    counting tools and file concatenation behave as expected. Lines are
    streamed to the file rather than joined into one large string first.
    """
    with gzip.open(filename, 'wt', encoding='utf-8') as fout:
        fout.writelines(f"{identifier}\n" for identifier in sorted(identifiers))


def main():
    """Compare the previous and current identifier lists.

    Reads PREV_FILENAME and CURRENT_FILENAME from the working directory and
    writes two gzipped files there:

    * prev_but_not_current.txt.gz -- identifiers only in the previous file
    * current_but_not_prev.txt.gz -- identifiers only in the current file
    """
    prev_identifiers = _load_identifiers(PREV_FILENAME)
    curr_identifiers = _load_identifiers(CURRENT_FILENAME)

    prev_but_not_current = prev_identifiers - curr_identifiers
    _write_identifiers('prev_but_not_current.txt.gz', prev_but_not_current)
    print(f"Found {len(prev_but_not_current)} identifiers previously but not in current.")

    current_but_not_prev = curr_identifiers - prev_identifiers
    _write_identifiers('current_but_not_prev.txt.gz', current_but_not_prev)
    print(f"Found {len(current_but_not_prev)} identifiers in current but not in previous.")


if __name__ == '__main__':
    main()

