Skip to content

Commit bc8f49c

Browse files
committed
Add script to delete stale records from the DB
1 parent b1be3ec commit bc8f49c

File tree

2 files changed

+289
-0
lines changed

2 files changed

+289
-0
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[db]
2+
# type of database
3+
backend = mysql
4+
# host with database
5+
hostname =
6+
# port to connect to
7+
port = 3306
8+
# database name
9+
name =
10+
# database user
11+
username =
12+
# password for database
13+
password =
14+
# Name of the table to purge stale records from
15+
table_name = VNormalisedSummaries
16+
17+
[common]
18+
# Timeframe (in hours) used to define the "recent" data window
19+
# Records newer than (MAX(UpdateTime) - timeframe) will be preserved
20+
timeframe = 24
21+
# Minimum number of records required within the timeframe to allow deletion
22+
# If preserved records < threshold, no deletion will occur.
# Required: must be set to an integer greater than 0 before running the script.
23+
threshold =
24+
25+
[logging]
26+
logfile = /var/log/apel/delete_stale_records.log
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
"""
2+
APEL - DELETE stale records from the DB
3+
4+
This script connects to a MySQL database and deletes stale records from a specified table.
5+
Records are only deleted if the number of recent entries (within a user-defined timeframe)
6+
meets or exceeds a threshold.
7+
8+
The script supports dry_run mode.
9+
10+
NOTE:
11+
Before running this script, ensure the following:
12+
13+
- A valid configuration file (e.g. delete_stale_records.cfg) is present and accessible.
14+
- A log file path (e.g. delete_stale_records.log) is defined either in the config
15+
or via the --log_config argument.
16+
- The directory for the log file exists and is writable.
17+
18+
You can override the default paths using:
19+
--db /path/to/delete_stale_records.cfg
20+
--log_config /path/to/delete_stale_records.log
21+
22+
Usage:
23+
python delete_stale_records.py
24+
python delete_stale_records.py --dry_run
25+
python delete_stale_records.py --db /path/to/delete_stale_records.cfg --log_config /path/to/delete_stale_records.log
26+
"""
27+
28+
# Requirements:
29+
# mysqlclient==2.1.1 # Latest package; works with Python 3.9+. Dropped support for Python 3.6.
30+
# Installation examples:
31+
# - For Python 3.6: python3.6 -m pip install mysqlclient==2.0.3
32+
# - For Python 3.9: python3.9 -m pip install mysqlclient==2.1.1
33+
34+
import os
35+
import sys
36+
import logging
37+
from datetime import timedelta
38+
from argparse import ArgumentParser
39+
from configparser import ConfigParser, NoSectionError, NoOptionError
40+
import MySQLdb
41+
42+
# Script version as a (major, minor, patch) tuple.
__version__ = (1, 0, 0)

# Start-up banner including the version. The banner previously named the
# "dbloader" script; it now matches the name this script announces at start-up.
ver = "Starting APEL DB Purge Script %s.%s.%s" % __version__

# Separator emitted at the start of each run's output.
LOG_BREAK = '====================='
45+
46+
47+
def _quote_table_name(table_name):
    """Return ``table_name`` backtick-quoted for interpolation into SQL.

    Table names cannot be bound as query parameters, so the configured value
    is validated before it is placed into any statement. Accepted forms are
    ``table`` and ``schema.table`` where each part contains only letters,
    digits, ``_`` and ``$``.

    Raises:
        ValueError: if the name does not match the accepted forms.
    """
    parts = table_name.strip().split('.')
    is_valid = len(parts) <= 2 and all(
        part and all(ch.isalnum() or ch in '_$' for ch in part)
        for part in parts
    )
    if not is_valid:
        raise ValueError(
            f"'table_name' in config is not a valid table name: {table_name!r}"
        )
    return '.'.join(f"`{part}`" for part in parts)


def _load_settings(cp):
    """Read and validate the [db] and [common] options from ``cp``.

    Returns:
        dict with the connection details, ``table_name``, its SQL-safe
        ``quoted_table`` form, ``timeframe`` (hours) and ``threshold``.

    Raises:
        NoSectionError, NoOptionError: if a section or option is missing.
        ValueError: if an option is empty, non-numeric or out of range.
    """
    settings = {
        'backend': cp.get('db', 'backend'),
        'host': cp.get('db', 'hostname'),
        'port': cp.getint('db', 'port'),
        'user': cp.get('db', 'username'),
        'password': cp.get('db', 'password'),
        'database': cp.get('db', 'name'),
        'table_name': cp.get('db', 'table_name'),
        'timeframe': cp.getint('common', 'timeframe'),
        'threshold': cp.getint('common', 'threshold'),
    }

    # Ensure no required string is empty
    for key in ('backend', 'host', 'user', 'password', 'database', 'table_name'):
        if not settings[key].strip():
            raise ValueError(f"'{key}' in config is empty")

    # Numeric validations
    if settings['timeframe'] <= 0:
        raise ValueError("'timeframe' must be > 0")
    if settings['threshold'] <= 0:
        raise ValueError("'threshold' must be > 0")

    settings['quoted_table'] = _quote_table_name(settings['table_name'])
    return settings


def _report(dry_run, level, message):
    """Print ``message`` in dry-run mode, otherwise log it at ``level``."""
    if dry_run:
        print(message)
    else:
        logging.getLogger(__name__).log(level, message)


def _purge(conn, cursor, settings, dry_run):
    """Run the purge workflow on an open connection.

    Checks that the table has an UpdateTime column, derives the cutoff from
    MAX(UpdateTime) minus the configured timeframe, counts the rows on each
    side of the cutoff, and deletes the older rows only when the number of
    recent rows meets the threshold and ``dry_run`` is false.
    """
    table = settings['table_name']
    quoted = settings['quoted_table']

    # Verify UpdateTime column exists
    cursor.execute(f"SHOW COLUMNS FROM {quoted} LIKE 'UpdateTime'")
    if cursor.fetchone() is None:
        _report(
            dry_run, logging.ERROR,
            f"'UpdateTime' column not found in table "
            f"'{table}'. Aborting operation."
        )
        return

    # Get the latest UpdateTime value (NULL when the table has no usable rows)
    cursor.execute(f"SELECT MAX(UpdateTime) FROM {quoted}")
    row = cursor.fetchone()
    if not row or not row[0]:
        _report(
            dry_run, logging.ERROR,
            "No UpdateTime values found. Nothing to purge."
        )
        return

    cutoff_time = row[0] - timedelta(hours=settings['timeframe'])
    cutoff_str = cutoff_time.strftime('%Y-%m-%d %H:%M:%S')

    cursor.execute(
        f"SELECT COUNT(*) FROM {quoted} WHERE UpdateTime >= %s",
        (cutoff_str,)
    )
    preserved_count = cursor.fetchone()[0]

    # Count the deletable rows before building the summary so that the
    # summary reports the real figure rather than a placeholder zero.
    cursor.execute(
        f"SELECT COUNT(*) FROM {quoted} WHERE UpdateTime < %s",
        (cutoff_str,)
    )
    deletable_count = cursor.fetchone()[0]

    summary = (
        f"[SUMMARY] Table: {table} | "
        f"Timeframe: {settings['timeframe']}h | "
        f"Threshold: {settings['threshold']} | "
        f"Preserved: {preserved_count} | "
        f"Deletable: {deletable_count} | "
        f"Cutoff: {cutoff_str}"
    )

    if preserved_count < settings['threshold']:
        action = 'DRY_RUN' if dry_run else 'ABORT'
        _report(dry_run, logging.INFO, f"{summary} | Action: {action}")
        _report(
            dry_run, logging.WARNING,
            f"Preserved records ({preserved_count}) below threshold "
            f"({settings['threshold']}). Skipping deletion."
        )
        return

    if dry_run:
        print(f"{summary} | Action: DRY_RUN")
        print(
            f"DRY_RUN: {deletable_count} rows would be deleted from the "
            f"'{table}' table (UpdateTime < {cutoff_str})."
        )
        return

    cursor.execute(
        f"DELETE FROM {quoted} WHERE UpdateTime < %s",
        (cutoff_str,)
    )
    # Report what the DELETE actually removed; the earlier count is only an
    # estimate because rows may change between the two statements.
    deleted_rows = cursor.rowcount
    conn.commit()

    logger = logging.getLogger(__name__)
    logger.info(f"{summary} | Action: SUCCESS")
    logger.info(
        f"{deleted_rows} rows deleted successfully from the "
        f"'{table}' table (UpdateTime < {cutoff_str})."
    )


def delete_stale_records(cp, args):
    """
    Deletes stale records from the configured MySQL table based on UpdateTime.

    Records older than (MAX(UpdateTime) - timeframe) are eligible for deletion,
    but only if the number of recent records (within timeframe) meets the threshold.
    With ``args.dry_run`` set, nothing is deleted and results are printed
    instead of logged.

    Parameters:
    cp: Parsed configuration object providing the [db] and [common] sections.
    args: Parsed command-line arguments; only ``dry_run`` is read.

    Exits with status 1 (after printing the reason) when the configuration is
    missing options or holds invalid values. MySQL errors are logged and the
    transaction is rolled back; they are not re-raised.
    """
    try:
        settings = _load_settings(cp)
    except (NoSectionError, NoOptionError) as e:
        print(f"Configuration error: {e}")
        sys.exit(1)
    except ValueError as e:
        print(f"Invalid configuration value: {e}")
        sys.exit(1)

    # Resolve the logger here instead of relying on a module-level global
    # that only exists when the file is run as a script.
    logger = logging.getLogger(__name__)
    conn = None
    cursor = None

    try:
        conn = MySQLdb.connect(
            host=settings['host'],
            port=settings['port'],
            user=settings['user'],
            passwd=settings['password'],
            db=settings['database']
        )
        cursor = conn.cursor()
        _purge(conn, cursor, settings, args.dry_run)
    except MySQLdb.Error as err:
        logger.error(f"MySQL error: {err}")
        if conn:
            conn.rollback()
            logger.warning("Transaction rolled back due to error.")
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
211+
212+
213+
if __name__ == '__main__':
    # Script entry point: parse the CLI, load the config, set up file
    # logging, then run the purge.

    # Default config and log file paths
    default_config_path = '/etc/apel/delete_stale_records.cfg'
    default_logfile_path = '/var/log/apel/delete_stale_records.log'

    # Parse CLI arguments
    parser = ArgumentParser(description='Delete stale records from APEL DB.')
    parser.add_argument(
        '-d', '--db',
        help='Location of DB config file',
        default=default_config_path
    )
    parser.add_argument(
        '-l', '--log_config',
        # The value is used directly as the log file path (see below), so
        # describe it as such rather than as a logging config file.
        help='Location of the log file (overrides "logfile" in the config)',
        default=None
    )
    parser.add_argument(
        '--dry_run',
        action='store_true',
        help='Preview deletions without executing'
    )
    args = parser.parse_args()

    # Load configuration file; cp.read() returns the list of files it parsed,
    # which is empty when the file is missing or unreadable.
    cp = ConfigParser()
    read_files = cp.read(args.db)
    if not read_files:
        print(f"Error: Failed to read config file: {args.db}")
        sys.exit(1)

    # Precedence: CLI argument, then the config file, then the default.
    # The trailing "or" also covers an empty "logfile =" entry in the config.
    logfile_path = (
        args.log_config
        or cp.get('logging', 'logfile', fallback=default_logfile_path)
        or default_logfile_path
    )

    # Ensure log directory exists. A bare filename has an empty dirname,
    # which means the current directory rather than a missing one.
    log_dir = os.path.dirname(logfile_path) or os.curdir
    if not os.path.isdir(log_dir):
        print(f"Error: Log directory does not exist: {log_dir}")
        sys.exit(1)

    logging.basicConfig(
        filename=logfile_path,
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    log = logging.getLogger(__name__)

    # Dry runs report to stdout only; real runs report to the log file.
    if args.dry_run:
        print(f"{LOG_BREAK}\nStarting APEL DB Purge Script\n")
    else:
        log.info(f"{LOG_BREAK}\nStarting APEL DB Purge Script\n")

    delete_stale_records(cp, args)

0 commit comments

Comments
 (0)