Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions portal-tools/delete_stale_records.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[db]
hostname =
port = 3306
username =
password =
schema_name =
# Name of the table to purge stale records from
table_name = NormalisedSummaries

[common]
# Timeframe (in hours) used to define the "recent" data window
# Records newer than (MAX(UpdateTime) - timeframe) will be preserved
timeframe = 24
# Minimum number of records required within the timeframe to allow deletion
# If preserved records < threshold, no deletion will occur
threshold =

[logging]
logfile = /var/log/apel/delete_stale_records.log
247 changes: 247 additions & 0 deletions portal-tools/delete_stale_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
"""
APEL - DELETE stale records from the DB

This script connects to a MySQL database and deletes stale records from a specified table.
Records are only deleted if the number of recent entries (within a user-defined timeframe)
meets or exceeds a threshold.

The script supports a `--dry-run` mode, which logs to stdout how many rows would be deleted without deleting anything.

NOTE:
Before running this script, ensure the following:

- A valid configuration file (e.g. delete_stale_records.cfg) is present and accessible.
- A log file path (e.g. delete_stale_records.log) is defined either in the config
or via the --log_config argument.
- The directory for the log file exists and is writable.

You can override the default paths using:
--db /path/to/delete_stale_records.cfg
--log_config /path/to/delete_stale_records.log

Usage:
python delete_stale_records.py
python delete_stale_records.py --dry-run
python delete_stale_records.py --db /path/to/delete_stale_records.cfg --log_config /path/to/delete_stale_records.log
"""

# Requirements:
# mysqlclient==2.1.1 # Pinned: installs on both Python 3.6 and Python 3.9+; newer mysqlclient releases dropped support for Python 3.6.
# Installation examples:
# - For Python 3.6: python3.6 -m pip install mysqlclient==2.1.1
# - For Python 3.9: python3.9 -m pip install mysqlclient==2.1.1

import logging
import os
import sys
from argparse import ArgumentParser
from configparser import ConfigParser, NoOptionError, NoSectionError
from datetime import timedelta

import MySQLdb
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a space between built in and 3rd-party imports.


# Constants
LOG_BREAK = '====================='
# Default config and log file paths
DEFAULT_DB_CONFIG_PATH = '/etc/apel/delete_stale_records.cfg'
DEFAULT_LOGFILE_PATH = '/var/log/apel/delete_stale_records.log'


def delete_stale_records(config_parser, args):
    """
    Deletes stale records from the configured MySQL table based on UpdateTime.

    Records older than (MAX(UpdateTime) - timeframe) are eligible for deletion,
    but only if the number of recent records (within timeframe) meets the threshold.

    Configuration problems (missing section/option, empty or non-numeric
    values, out-of-range numbers) are printed and terminate the process with
    exit status 1. MySQL errors are logged and the transaction is rolled back;
    the function then returns normally.

    Parameters:
        config_parser: Parsed configuration object. Must provide the options
            hostname, port, username, password, schema_name and table_name in
            the [db] section, and timeframe (hours) and threshold in [common].
        args: Parsed command-line arguments. Only `args.dry_run` is read; when
            true, the number of deletable rows is logged but nothing is deleted.
    """
    # Look the logger up by name instead of relying on a module-level global,
    # so the function also works when this file is imported rather than run
    # as a script. When run as a script this is the logger configured in the
    # __main__ section (same name).
    log = logging.getLogger(__name__)

    conn = None
    cursor = None

    try:
        # Extract database details from the "*.cfg" file
        db_config = {
            'host': config_parser.get('db', 'hostname'),
            'port': config_parser.getint('db', 'port'),
            'user': config_parser.get('db', 'username'),
            'password': config_parser.get('db', 'password'),
            'schema_name': config_parser.get('db', 'schema_name'),
            'table_name': config_parser.get('db', 'table_name'),
            'timeframe': config_parser.getint('common', 'timeframe'),
            'threshold': config_parser.getint('common', 'threshold'),
        }

        # Ensure no required string is empty. 'port' is intentionally not in
        # this list: getint() has already converted it to an int (an empty or
        # non-numeric port raises ValueError there), and calling .strip() on
        # an int would raise AttributeError.
        for key in ['host', 'user', 'password', 'schema_name', 'table_name']:
            if not db_config[key].strip():
                raise ValueError(f"'{key}' in config is empty")

        # Numeric validations
        if not 0 < db_config['port'] <= 65535:
            raise ValueError("'port' must be between 1 and 65535")
        if db_config['timeframe'] <= 0:
            raise ValueError("'timeframe' must be > 0")
        if db_config['threshold'] <= 0:
            raise ValueError("'threshold' must be > 0")

    except (NoSectionError, NoOptionError) as e:
        print(f"Configuration error: {e}")
        sys.exit(1)
    except ValueError as e:
        print(f"Invalid configuration value: {e}")
        sys.exit(1)

    try:
        # Connect to MySQL
        conn = MySQLdb.connect(
            host=db_config['host'],
            port=db_config['port'],
            user=db_config['user'],
            passwd=db_config['password'],
            db=db_config['schema_name']
        )
        cursor = conn.cursor()

        # Verify UpdateTime column exists
        # (the table name comes from the local config file and cannot be
        # passed as a bound parameter, hence the string interpolation).
        cursor.execute(
            f"SHOW COLUMNS FROM {db_config['table_name']} LIKE 'UpdateTime'"
        )

        if cursor.fetchone() is None:
            log.error(
                f"'UpdateTime' column not found in table "
                f"'{db_config['table_name']}'. Aborting operation."
            )
            return

        # Get the latest UpdateTime value
        cursor.execute(
            f"SELECT MAX(UpdateTime) FROM {db_config['table_name']}"
        )
        result = cursor.fetchone()

        # MAX() over an empty table (or all-NULL column) yields NULL/None
        if not result or not result[0]:
            log.warning("No UpdateTime values found. Nothing to purge.")
            return

        # The cutoff is relative to the newest record, not to the wall clock
        max_update = result[0]
        cutoff_time = max_update - timedelta(hours=db_config['timeframe'])
        cutoff_str = cutoff_time.strftime('%Y-%m-%d %H:%M:%S')

        # Count preserved records
        cursor.execute(
            f"SELECT COUNT(*) FROM {db_config['table_name']} WHERE UpdateTime >= %s",
            (cutoff_str,)
        )
        preserved_count = cursor.fetchone()[0]

        # Count deletable records
        cursor.execute(
            f"SELECT COUNT(*) FROM {db_config['table_name']} WHERE UpdateTime < %s",
            (cutoff_str,)
        )
        deletable_count = cursor.fetchone()[0]

        # Summary log
        summary = (
            f"[SUMMARY] Table: {db_config['table_name']} | "
            f"Timeframe: {db_config['timeframe']}h | "
            f"Threshold: {db_config['threshold']} | "
            f"Preserved: {preserved_count} | "
            f"Deletable: {deletable_count} | "
            f"Cutoff: {cutoff_str}"
        )
        log.info(summary)

        # Threshold check: too few recent records means nothing is deleted
        if preserved_count < db_config['threshold']:
            log.warning(
                f"Preserved records ({preserved_count}) below threshold "
                f"({db_config['threshold']}). Skipping deletion."
            )
            return

        # Perform deletion or simulate
        if args.dry_run:
            log.info(
                f"DRY_RUN: {deletable_count} rows would be deleted from the "
                f"'{db_config['table_name']}' (UpdateTime < {cutoff_str})."
            )
        else:
            cursor.execute(
                f"DELETE FROM {db_config['table_name']} WHERE UpdateTime < %s",
                (cutoff_str,)
            )
            deleted_rows = cursor.rowcount
            conn.commit()
            log.info(
                f"{deleted_rows} rows deleted successfully from the "
                f"'{db_config['table_name']}' (UpdateTime < {cutoff_str})."
            )

    except MySQLdb.Error as err:
        log.error(f"MySQL error: {err}")
        if conn:
            conn.rollback()
            log.warning("Transaction rolled back due to error.")

    finally:
        # Always release DB resources, including on the early-return paths
        if cursor:
            cursor.close()
        if conn:
            conn.close()


if __name__ == '__main__':
    # Parse CLI arguments
    parser = ArgumentParser(description='Delete stale records from APEL DB.')
    parser.add_argument(
        '-d', '--db',
        help='Location of DB config file',
        default=DEFAULT_DB_CONFIG_PATH
    )
    # Despite its name, this option is the path of the log FILE itself
    # (it is passed straight to logging.FileHandler below).
    parser.add_argument(
        '-l', '--log_config',
        help='Location of the log file (overrides the logfile setting '
             'in the [logging] section of the config file)',
        default=None
    )
    parser.add_argument(
        '--dry-run',
        dest='dry_run',
        action='store_true',
        help='Preview deletions without executing'
    )
    args = parser.parse_args()

    # Load configuration file
    # ConfigParser.read() silently skips unreadable files and returns the
    # list of files it actually parsed, so an empty list means failure.
    config_parser = ConfigParser()
    read_files = config_parser.read(args.db)
    if not read_files:
        print(f"ERROR: Failed to read config file: {args.db}")
        sys.exit(1)

    # Resolve the log file path: CLI option first, then the config file,
    # then the built-in default. The trailing `or` also covers a config
    # entry that is present but empty ("logfile =").
    LOGFILE_PATH = (
        args.log_config
        or config_parser.get('logging', 'logfile', fallback=DEFAULT_LOGFILE_PATH)
        or DEFAULT_LOGFILE_PATH
    )

    # Ensure log directory exists
    # dirname() returns '' for a bare filename such as "purge.log"; that
    # means "current directory", which isdir('') would wrongly reject.
    LOG_DIR = os.path.dirname(LOGFILE_PATH) or os.curdir
    if not os.path.isdir(LOG_DIR):
        print(f"ERROR: Log directory does not exist: {LOG_DIR}")
        sys.exit(1)

    # Configure logger
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    # Dry runs report to the terminal; real runs append to the log file
    if args.dry_run:
        handler = logging.StreamHandler(sys.stdout)
    else:
        try:
            handler = logging.FileHandler(LOGFILE_PATH)
        except OSError as e:
            # e.g. directory not writable, or the path is a directory
            print(f"ERROR: Cannot open log file {LOGFILE_PATH}: {e}")
            sys.exit(1)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    log.addHandler(handler)

    log.info(f"{LOG_BREAK}\nStarting APEL DB Purge Script\n")

    delete_stale_records(config_parser, args)