diff --git a/src/onegov/landsgemeinde/forms/agenda.py b/src/onegov/landsgemeinde/forms/agenda.py index cf6dd0e4f0..c4caa97c1e 100644 --- a/src/onegov/landsgemeinde/forms/agenda.py +++ b/src/onegov/landsgemeinde/forms/agenda.py @@ -1,8 +1,20 @@ from __future__ import annotations -from datetime import datetime +import re +import zipfile +import os +import base64 +import gzip import pytz +from bs4 import BeautifulSoup +from datetime import datetime +from markupsafe import Markup +from io import BytesIO +from pathlib import Path +from tempfile import TemporaryDirectory + +from onegov.form import Form from onegov.form.fields import TagsField from onegov.form.fields import TimeField from onegov.form.fields import UploadField @@ -27,11 +39,11 @@ from wtforms.validators import Optional from wtforms.validators import ValidationError - from typing import Any from typing import TYPE_CHECKING if TYPE_CHECKING: from onegov.landsgemeinde.request import LandsgemeindeRequest + from onegov.landsgemeinde.collections import AgendaItemCollection class AgendaItemForm(NamedFileForm): @@ -202,3 +214,179 @@ def populate_obj(self, obj: AgendaItem) -> None: # type:ignore[override] tz = pytz.timezone('Europe/Zurich') now = datetime.now(tz=tz).time() obj.start_time = now + + +class AgendaItemUploadForm(Form): + + request: LandsgemeindeRequest + + agenda_item_zip = UploadField( + label=_('Agenda Item ZIP'), + fieldset=_('Import'), + validators=[ + WhitelistedMimeType({'application/zip'}), + FileSizeLimit(100 * 1024 * 1024) + ] + ) + + def get_html_dir(self, + temp: TemporaryDirectory[str], + field: UploadField) -> str | None: + + temp_path = Path(temp.name) + zip_content = None + + if isinstance(field.data, dict) and 'data' in field.data: + encoded_data = field.data['data'] + decoded_data = base64.b64decode(encoded_data) + + if decoded_data[:2] == b'\x1f\x8b': + decompressed_data = gzip.decompress(decoded_data) + zip_content = BytesIO(decompressed_data) + else: + zip_content = BytesIO(decoded_data) + + with zipfile.ZipFile( + zip_content, 'r') as zip_ref: # type:ignore + zip_ref.extractall(temp_path) + + html_dir = None + for root, dirs, files in os.walk(temp_path): + if 'html' in dirs: + html_dir = os.path.join(root, 'html') + break + + return html_dir + + def import_agenda_item( + self, collection: AgendaItemCollection) -> AgendaItem: + + temp = TemporaryDirectory() + html_dir = self.get_html_dir(temp, self.agenda_item_zip) + html_path = Path(html_dir) # type:ignore + html_files = sorted( + [f for f in html_path.glob('*.html') + if f.name != 'combined_clean.html'], + key=lambda f: [int(s) if s.isdigit() else -1 + for s in re.findall(r'\d+|\D+', f.name)] + ) + + combined_html = BeautifulSoup( + '', + 'html.parser') + title = '' + + for file_path in html_files: + with open(file_path, encoding='utf-8') as f: + soup = BeautifulSoup(f.read(), 'html.parser') + + all_paragraphs = [] + + if soup.body is None: + continue + textframes = soup.body.find_all('div', recursive=False) + for textframe in textframes[1:] if textframes else []: + for p_tag in textframe.find_all('p'): + p_class = ' '.join(p_tag.get('class', [])) + spans_text = [] + parent = p_tag.find_parent() + + parent_inline_style = parent.get('style', '' + ) if parent else '' + if not parent_inline_style: + p_class = 'table' + + regex = re.compile(r'^_idTextSpan\d+') + for span in p_tag.find_all('span', id=regex): + text = span.get_text().strip() + if text: + spans_text.append(text) + + if spans_text: + all_paragraphs.append({ + 'class': p_class, + 'text': ' '.join(spans_text), + 'is_list_item': 'Aufz-hlung' in p_class + }) + + i = 0 + while i < len(all_paragraphs): + para = all_paragraphs[i] + if combined_html.body is None: + break + + # Check if this is the start of a list + if para['is_list_item']: + ul_element = combined_html.new_tag('ul') + + # Add this and all consecutive list items to the