diff --git a/MANUAL.txt b/MANUAL.txt index 8f9ba8b0e5bd..d394b2f6d2e1 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -273,6 +273,7 @@ header when requesting a document from a URL: - `opml` ([OPML]) - `org` ([Emacs Org mode]) - `pod` (Perl's [Plain Old Documentation]) + - `pptx` ([PowerPoint]) - `ris` ([RIS] bibliography) - `rtf` ([Rich Text Format]) - `rst` ([reStructuredText]) diff --git a/pandoc.cabal b/pandoc.cabal index a5f21dbb26e3..3807f7c35820 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -610,6 +610,7 @@ library Text.Pandoc.Readers.TikiWiki, Text.Pandoc.Readers.Txt2Tags, Text.Pandoc.Readers.Docx, + Text.Pandoc.Readers.Pptx, Text.Pandoc.Readers.ODT, Text.Pandoc.Readers.EPUB, Text.Pandoc.Readers.Muse, @@ -718,6 +719,11 @@ library Text.Pandoc.Readers.Docx.Util, Text.Pandoc.Readers.Docx.Symbols, Text.Pandoc.Readers.Docx.Fields, + Text.Pandoc.Readers.OOXML.Shared, + Text.Pandoc.Readers.Pptx.Parse, + Text.Pandoc.Readers.Pptx.Shapes, + Text.Pandoc.Readers.Pptx.Slides, + Text.Pandoc.Readers.Pptx.SmartArt, Text.Pandoc.Readers.HTML.Parsing, Text.Pandoc.Readers.HTML.Table, Text.Pandoc.Readers.HTML.TagCategories, @@ -854,6 +860,7 @@ test-suite test-pandoc Tests.Readers.RST Tests.Readers.RTF Tests.Readers.Docx + Tests.Readers.Pptx Tests.Readers.ODT Tests.Readers.Txt2Tags Tests.Readers.EPUB diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 12d1c6c95bf1..5f7b891e2322 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -26,6 +26,7 @@ module Text.Pandoc.Readers Reader (..) , readers , readDocx + , readPptx , readODT , readMarkdown , readCommonMark @@ -87,6 +88,7 @@ import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.Creole import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.Docx +import Text.Pandoc.Readers.Pptx import Text.Pandoc.Readers.DokuWiki import Text.Pandoc.Readers.EPUB import Text.Pandoc.Readers.FB2 @@ -157,6 +159,7 @@ readers = [("native" , TextReader readNative) ,("twiki" , TextReader readTWiki) ,("tikiwiki" , TextReader readTikiWiki) ,("docx" , ByteStringReader readDocx) + ,("pptx" , ByteStringReader readPptx) ,("odt" , ByteStringReader readODT) ,("t2t" , TextReader readTxt2Tags) ,("epub" , ByteStringReader readEPUB) diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index 88c1973f9016..b8c51534b6d6 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -24,51 +24,11 @@ module Text.Pandoc.Readers.Docx.Util ( , extractChildren ) where -import qualified Data.Text as T -import Data.Text (Text) -import Text.Pandoc.XML.Light -import qualified Data.Map as M import Data.List (partition) - -type NameSpaces = M.Map Text Text - -elemToNameSpaces :: Element -> NameSpaces -elemToNameSpaces = foldr (\(Attr qn val) -> - case qn of - QName s _ (Just "xmlns") -> M.insert s val - _ -> id) mempty . elAttribs - -elemName :: NameSpaces -> Text -> Text -> QName -elemName ns prefix name = - QName name (M.lookup prefix ns) - (if T.null prefix then Nothing else Just prefix) - -isElem :: NameSpaces -> Text -> Text -> Element -> Bool -isElem ns prefix name element = - let ns' = ns <> elemToNameSpaces element - in qName (elName element) == name && - qURI (elName element) == M.lookup prefix ns' - -findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element -findChildByName ns pref name el = - let ns' = ns <> elemToNameSpaces el - in findChild (elemName ns' pref name) el - -findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] -findChildrenByName ns pref name el = - let ns' = ns <> elemToNameSpaces el - in findChildren (elemName ns' pref name) el - --- | Like 'findChildrenByName', but searches descendants. -findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element -findElementByName ns pref name el = - let ns' = ns <> elemToNameSpaces el - in findElement (elemName ns' pref name) el - -findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text -findAttrByName ns pref name el = - let ns' = ns <> elemToNameSpaces el - in findAttr (elemName ns' pref name) el +import Text.Pandoc.XML.Light +import Text.Pandoc.Readers.OOXML.Shared + (NameSpaces, elemName, isElem, elemToNameSpaces, + findChildByName, findChildrenByName, findElementByName, findAttrByName) -- | Removes child elements that satisfy a given condition. diff --git a/src/Text/Pandoc/Readers/OOXML/Shared.hs b/src/Text/Pandoc/Readers/OOXML/Shared.hs new file mode 100644 index 000000000000..0aa07a736427 --- /dev/null +++ b/src/Text/Pandoc/Readers/OOXML/Shared.hs @@ -0,0 +1,95 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.OOXML.Shared + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +Shared utilities for Office Open XML (OOXML) readers (DOCX, PPTX). +Provides common functions for ZIP archive handling, XML parsing, +namespace management, and DrawingML parsing. +-} +module Text.Pandoc.Readers.OOXML.Shared + ( -- * Constants + emusPerInch + , emuToInches + , inchesToEmu + -- * Types + , NameSpaces + , elemName + , elemToNameSpaces + , isElem + , findChildByName + , findChildrenByName + , findElementByName + , findAttrByName + ) where + +import qualified Data.Map as M +import qualified Data.Text as T +import Data.Text (Text) +import Text.Pandoc.XML.Light + +-- | Type alias for namespace mappings +type NameSpaces = M.Map Text Text + +-- | English Metric Units per inch +-- 1 inch = 914400 EMUs (used in OOXML for dimensions) +emusPerInch :: Integer +emusPerInch = 914400 + +-- | Convert EMUs to inches +emuToInches :: Integer -> Double +emuToInches n = fromIntegral n / fromIntegral emusPerInch + +-- | Convert inches to EMUs +inchesToEmu :: Double -> Integer +inchesToEmu n = round (n * fromIntegral emusPerInch) + +-- | Extract namespace declarations from element attributes +elemToNameSpaces :: Element -> NameSpaces +elemToNameSpaces = foldr (\(Attr qn val) -> + case qn of + QName s _ (Just "xmlns") -> M.insert s val + _ -> id) mempty . elAttribs + +-- | Create a qualified name from namespace map, prefix, and local name +elemName :: NameSpaces -> Text -> Text -> QName +elemName ns prefix name = + QName name + (M.lookup prefix ns) + (if T.null prefix then Nothing else Just prefix) + +-- | Check if element matches namespace prefix and local name +isElem :: NameSpaces -> Text -> Text -> Element -> Bool +isElem ns prefix name element = + let ns' = ns <> elemToNameSpaces element + in qName (elName element) == name && + qURI (elName element) == M.lookup prefix ns' + +-- | Find first child element matching namespace and name +findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findChildByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findChild (elemName ns' pref name) el + +-- | Find all children matching namespace and name +findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] +findChildrenByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findChildren (elemName ns' pref name) el + +-- | Find element anywhere in descendants matching namespace and name +findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findElementByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findElement (elemName ns' pref name) el + +-- | Find attribute value by namespace prefix and name +findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text +findAttrByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findAttr (elemName ns' pref name) el diff --git a/src/Text/Pandoc/Readers/Pptx.hs b/src/Text/Pandoc/Readers/Pptx.hs new file mode 100644 index 000000000000..6ffc8ee3cbf2 --- /dev/null +++ b/src/Text/Pandoc/Readers/Pptx.hs @@ -0,0 +1,43 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Pptx + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +Conversion of PPTX (PowerPoint) documents to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Pptx (readPptx) where + +import qualified Data.ByteString.Lazy as B +import qualified Data.Text as T +import Codec.Archive.Zip (toArchiveOrFail) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition (Pandoc(..)) +import Text.Pandoc.Error (PandocError(..)) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Readers.Pptx.Parse (archiveToPptx) +import Text.Pandoc.Readers.Pptx.Slides (pptxToOutput) + +-- | Read PPTX file into Pandoc AST +readPptx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc +readPptx opts bytes = + case toArchiveOrFail bytes of + Right archive -> + case archiveToPptx archive of + Right pptx -> do + -- Convert Pptx intermediate to Pandoc AST + (meta, blocks) <- pptxToOutput opts pptx + return $ Pandoc meta blocks + + Left err -> + throwError $ PandocParseError $ + "Failed to parse PPTX: " <> err + + Left err -> + throwError $ PandocParseError $ + "Failed to unpack PPTX archive: " <> T.pack err diff --git a/src/Text/Pandoc/Readers/Pptx/Parse.hs b/src/Text/Pandoc/Readers/Pptx/Parse.hs new file mode 100644 index 000000000000..174c3345fd9a --- /dev/null +++ b/src/Text/Pandoc/Readers/Pptx/Parse.hs @@ -0,0 +1,217 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Pptx.Parse + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +Parsing of PPTX archive to intermediate representation. +-} +module Text.Pandoc.Readers.Pptx.Parse + ( Pptx(..) + , PresentationDoc(..) + , PptxSlide(..) + , SlideId(..) + , archiveToPptx + ) where + +import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry) +import qualified Data.ByteString.Lazy as B +import Data.List (find) +import qualified Data.Map as M +import Data.Maybe (mapMaybe) +import qualified Data.Text as T +import qualified Data.Text.Lazy.Encoding as TL +import Data.Text (Text) +import System.FilePath (splitFileName) +import Text.Pandoc.Readers.OOXML.Shared +import Text.Pandoc.XML.Light +import Text.Read (readMaybe) + +-- | Slide identifier +newtype SlideId = SlideId Int deriving (Show, Eq, Ord) + +-- | Complete PPTX document (intermediate representation) +data Pptx = Pptx + { pptxPresentation :: PresentationDoc + , pptxSlides :: [PptxSlide] + , pptxArchive :: Archive + } deriving (Show) + +-- | Individual slide data +data PptxSlide = PptxSlide + { slideId :: SlideId + , slidePath :: FilePath + , slideElement :: Element -- The parsed p:sld element + , slideRels :: [(Text, Text)] -- Slide relationships + } deriving (Show) + +-- | Presentation-level information from presentation.xml +data PresentationDoc = PresentationDoc + { presNameSpaces :: NameSpaces + , presSlideSize :: (Integer, Integer) -- (width, height) in pixels + , presSlideIds :: [(SlideId, Text)] -- (slideId, relationshipId) + } deriving (Show) + +-- | Parse PPTX archive to intermediate representation +archiveToPptx :: Archive -> Either Text Pptx +archiveToPptx archive = do + -- Find and parse presentation.xml + presPath <- getPresentationXmlPath archive + presElem <- loadXMLFromArchive archive presPath + presDoc <- elemToPresentation presElem + + -- Load presentation relationships to resolve slide paths + presRelsPath <- getPresentationRelsPath archive presPath + presRels <- loadRelationships archive presRelsPath + + -- Parse each slide + slides <- mapM (parseSlide archive presRels) (presSlideIds presDoc) + + return $ Pptx presDoc slides archive + +-- | Find presentation.xml via root relationships +getPresentationXmlPath :: Archive -> Either Text FilePath +getPresentationXmlPath archive = do + -- Load _rels/.rels + relsEntry <- maybeToEither "Missing _rels/.rels" $ + findEntryByPath "_rels/.rels" archive + + relsElem <- parseXMLFromEntry relsEntry + + -- The Relationships element has a default namespace, but Relationship children don't use prefix + -- We need to look at all children regardless of namespace + let relElems = onlyElems $ elContent relsElem + + -- Look for relationship containing "officeDocument" in Type attribute + case find isOfficeDocRel relElems of + Nothing -> Left $ "No presentation.xml relationship found. Found " <> + T.pack (show (length relElems)) <> " relationships." + Just rel -> do + target <- maybeToEither "Missing Target attribute" $ + findAttr (unqual "Target") rel + return $ T.unpack target -- Convert Text to FilePath + + where + isOfficeDocRel el = + case findAttr (unqual "Type") el of + -- Must end with "/officeDocument" to avoid matching "/extended-properties" + Just relType -> "/officeDocument" `T.isSuffixOf` relType + Nothing -> False + +-- | Load and parse XML from archive entry +loadXMLFromArchive :: Archive -> FilePath -> Either Text Element +loadXMLFromArchive archive path = do + entry <- maybeToEither ("Entry not found: " <> T.pack path) $ + findEntryByPath path archive + + let xmlBytes = fromEntry entry + parseXMLFromBS xmlBytes + +-- | Parse XML from ByteString +parseXMLFromBS :: B.ByteString -> Either Text Element +parseXMLFromBS = parseXMLElement . TL.decodeUtf8 + +-- | Parse XML from Entry +parseXMLFromEntry :: Entry -> Either Text Element +parseXMLFromEntry = parseXMLFromBS . fromEntry + +-- | Parse presentation.xml element to PresentationDoc +elemToPresentation :: Element -> Either Text PresentationDoc +elemToPresentation presElem = do + let ns = elemToNameSpaces presElem + + -- Extract slide size (with defaults) + let sizeElem = findChildByName ns "p" "sldSz" presElem + (widthEMU, heightEMU) = case sizeElem of + Just el -> + let cx = readAttrInt "cx" el + cy = readAttrInt "cy" el + in (cx, cy) + Nothing -> (9144000, 6858000) -- Default 10" x 7.5" + + -- Convert EMUs to pixels (approximate for metadata) + let width = widthEMU `div` emusPerInch + height = heightEMU `div` emusPerInch + + -- Extract slide ID list (optional - some presentations may have no slides) + let sldIdLstElem = findChildByName ns "p" "sldIdLst" presElem + + slideRefs <- case sldIdLstElem of + Nothing -> return [] -- No slides is valid for templates/masters-only presentations + Just elem -> do + let sldIdElems = findChildren (elemName ns "p" "sldId") elem + mapM (extractSlideRef ns) (zip [1..] sldIdElems) + + return $ PresentationDoc + { presNameSpaces = ns + , presSlideSize = (width, height) + , presSlideIds = slideRefs + } + +-- | Extract slide ID and relationship ID from p:sldId element +extractSlideRef :: NameSpaces -> (Int, Element) -> Either Text (SlideId, Text) +extractSlideRef ns (idx, sldIdElem) = do + relId <- maybeToEither ("Missing r:id in slide " <> T.pack (show idx)) $ + findAttrByName ns "r" "id" sldIdElem + + return (SlideId idx, relId) + +-- | Safe read attribute as Integer (with default of 0) +readAttrInt :: Text -> Element -> Integer +readAttrInt attrName el = + case findAttr (unqual attrName) el of + Just str -> case readMaybe (T.unpack str) of + Just n -> n + Nothing -> 0 + Nothing -> 0 + +-- | Get presentation relationships path +getPresentationRelsPath :: Archive -> FilePath -> Either Text FilePath +getPresentationRelsPath _archive presPath = + -- ppt/presentation.xml → ppt/_rels/presentation.xml.rels + let (dir, file) = splitFileName presPath + relsPath = dir ++ "/_rels/" ++ file ++ ".rels" + in Right relsPath + +-- | Load relationships from .rels file +loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)] +loadRelationships archive relsPath = + case findEntryByPath relsPath archive of + Nothing -> Right [] -- No relationships is OK + Just entry -> do + relsElem <- parseXMLFromEntry entry + let relElems = onlyElems $ elContent relsElem + return $ mapMaybe extractRelationship relElems + where + extractRelationship el = do + relId <- findAttr (unqual "Id") el + target <- findAttr (unqual "Target") el + return (relId, target) + +-- | Parse a single slide +parseSlide :: Archive -> [(Text, Text)] -> (SlideId, Text) -> Either Text PptxSlide +parseSlide archive rels (sid, relId) = do + -- Resolve relationship to get slide path + target <- maybeToEither ("Relationship not found: " <> relId) $ + lookup relId rels + + -- Resolve relative path: ppt/slides/slide1.xml + let slidePath' = "ppt/" <> T.unpack target + + -- Load and parse slide XML + slideElem <- loadXMLFromArchive archive slidePath' + + -- Load slide-specific relationships + slideRelsPath <- getPresentationRelsPath archive slidePath' + slideRels' <- loadRelationships archive slideRelsPath + + return $ PptxSlide sid slidePath' slideElem slideRels' + +-- | Helper: Maybe a -> Either Text a +maybeToEither :: Text -> Maybe a -> Either Text a +maybeToEither err Nothing = Left err +maybeToEither _ (Just x) = Right x diff --git a/src/Text/Pandoc/Readers/Pptx/Shapes.hs b/src/Text/Pandoc/Readers/Pptx/Shapes.hs new file mode 100644 index 000000000000..d316a7b22e86 --- /dev/null +++ b/src/Text/Pandoc/Readers/Pptx/Shapes.hs @@ -0,0 +1,330 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# OPTIONS_GHC -Wno-partial-fields #-} +{- | + Module : Text.Pandoc.Readers.Pptx.Shapes + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +Parsing of PPTX shapes (text boxes, images, tables, diagrams). +-} +module Text.Pandoc.Readers.Pptx.Shapes + ( PptxShape(..) + , PptxParagraph(..) + , BulletType(..) + , parseShapes + , parseShape + , shapeToBlocks + , isTitlePlaceholder + , extractDrawingMLText + ) where + +import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry) +import qualified Data.ByteString.Lazy as B +import Data.List (find, groupBy) +import Data.Maybe (mapMaybe) +import qualified Data.Text as T +import Data.Text (Text) +import Text.Read (readMaybe) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import qualified Text.Pandoc.Class.PandocMonad as P +import Text.Pandoc.Definition +import Text.Pandoc.Readers.OOXML.Shared +import Text.Pandoc.Readers.Pptx.SmartArt +import Text.Pandoc.XML.Light + +-- | Paragraph with bullet/numbering information +data PptxParagraph = PptxParagraph + { paraLevel :: Int -- Bullet level (0, 1, 2...) + , paraBullet :: BulletType + , paraText :: Text + } deriving (Show) + +-- | Bullet type +data BulletType + = NoBullet + | Bullet -- Has bullet (character detected or implicit) + | WingdingsBullet -- Detected via Wingdings symbol + deriving (Show, Eq) + +-- | Shape types in PPTX slides +data PptxShape + = PptxTextBox [PptxParagraph] -- Parsed paragraphs with bullet info + | PptxPicture + { picRelId :: Text -- Relationship ID (lazy loading) + , picTitle :: Text + , picAlt :: Text + } + | PptxTable [[Text]] -- Simple text cells for now + | PptxDiagramRef + { dgmDataRelId :: Text -- Relationship to data.xml + , dgmLayoutRelId :: Text -- Relationship to layout.xml + } + | PptxGraphic Text -- Placeholder for other graphics + deriving (Show) + +-- | Parse all shapes from shape tree +parseShapes :: NameSpaces -> Element -> [PptxShape] +parseShapes ns spTreeElem = + let shapeElems = onlyElems $ elContent spTreeElem + -- Merge parent namespaces with element namespaces + ns' = ns <> elemToNameSpaces spTreeElem + in mapMaybe (parseShape ns') shapeElems + +-- | Parse individual shape element +parseShape :: NameSpaces -> Element -> Maybe PptxShape +parseShape ns el + -- Text box: with + | isElem ns "p" "sp" el = + case findChildByName ns "p" "txBody" el of + Just txBody -> + let paras = parseParagraphs ns txBody + in if null paras + then Nothing + else Just $ PptxTextBox paras + Nothing -> Nothing + + -- Picture: + | isElem ns "p" "pic" el = do + nvPicPr <- findChildByName ns "p" "nvPicPr" el + cNvPr <- findChildByName ns "p" "cNvPr" nvPicPr + + let title = maybe "" id $ findAttr (unqual "name") cNvPr + alt = maybe "" id $ findAttr (unqual "descr") cNvPr + + -- Get blip relationship ID + blipFill <- findChildByName ns "p" "blipFill" el + blip <- findChildByName ns "a" "blip" blipFill + relId <- findAttrByName ns "r" "embed" blip + + return $ PptxPicture relId title alt + + -- GraphicFrame: table or diagram + | isElem ns "p" "graphicFrame" el = + case findChildByName ns "a" "graphic" el >>= + findChildByName ns "a" "graphicData" of + Nothing -> Nothing + Just graphicData -> + case findAttr (unqual "uri") graphicData of + Nothing -> Just $ PptxGraphic "no-uri" + Just uri -> + if "table" `T.isInfixOf` uri + then + -- Table + case findChildByName ns "a" "tbl" graphicData of + Just tbl -> + let rows = parseTableRows ns tbl + in Just $ PptxTable rows + Nothing -> Nothing + else if "diagram" `T.isInfixOf` uri + then + -- SmartArt diagram - dgm namespace is declared inline on relIds element + let dgmRelIds = find (\e -> qName (elName e) == "relIds") (elChildren graphicData) + in case dgmRelIds of + Nothing -> Just $ PptxGraphic "diagram-no-relIds" + Just relIdsElem -> + -- Get r:dm and r:lo attributes (r namespace is in parent) + let ns' = ns <> elemToNameSpaces relIdsElem + in case (findAttrByName ns' "r" "dm" relIdsElem, + findAttrByName ns' "r" "lo" relIdsElem) of + (Just dataRelId, Just layoutRelId) -> + Just $ PptxDiagramRef dataRelId layoutRelId + _ -> Just $ PptxGraphic "diagram-missing-rels" + else + -- Other graphic (chart, etc.) + Just $ PptxGraphic ("other: " <> uri) + + -- Skip other shapes for now + | otherwise = Nothing + +-- | Parse table rows (simple text extraction) +parseTableRows :: NameSpaces -> Element -> [[Text]] +parseTableRows ns tblElem = + let trElems = findChildrenByName ns "a" "tr" tblElem + in map (parseTableRow ns) trElems + +parseTableRow :: NameSpaces -> Element -> [Text] +parseTableRow ns trElem = + let tcElems = findChildrenByName ns "a" "tc" trElem + in map extractCellText tcElems + where + extractCellText tcElem = + -- Get text from txBody/a:p/a:r/a:t + case findChildByName ns "a" "txBody" tcElem of + Just txBody -> extractDrawingMLText txBody + Nothing -> "" + +-- | Convert shape to Pandoc blocks +shapeToBlocks :: PandocMonad m => Archive -> [(Text, Text)] -> PptxShape -> m [Block] +shapeToBlocks _archive _rels (PptxTextBox paras) = + return $ paragraphsToBlocks paras +shapeToBlocks archive rels (PptxPicture relId title alt) = do + -- Resolve relationship to get media path + case lookup relId rels of + Nothing -> return [] -- Image not found + Just target -> do + let mediaPath = resolveMediaPath target + + -- Load image bytes and add to MediaBag + case loadMediaFromArchive archive mediaPath of + Nothing -> return [] + Just mediaBytes -> do + P.insertMedia (T.unpack mediaPath) Nothing mediaBytes + + let altText = if T.null alt then [] else [Str alt] + return [Para [Image nullAttr altText (mediaPath, title)]] + +shapeToBlocks _archive _rels (PptxTable rows) = + -- Simple table representation for now + case rows of + [] -> return [] + (headerRow:bodyRows) -> do + let makeCell text = Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str text]] + headerCells = map makeCell headerRow + bodyCells = map (map makeCell) bodyRows + caption = Caption Nothing [] + colSpec = replicate (length headerRow) (AlignDefault, ColWidthDefault) + headerRow' = Row nullAttr headerCells + bodyRows' = map (Row nullAttr) bodyCells + thead = TableHead nullAttr [headerRow'] + tbody = [TableBody nullAttr 0 [] bodyRows'] + tfoot = TableFoot nullAttr [] + return [Table nullAttr caption colSpec thead tbody tfoot] + +shapeToBlocks archive rels (PptxDiagramRef dataRelId layoutRelId) = do + -- Parse SmartArt diagram + case parseDiagram archive rels dataRelId layoutRelId of + Left err -> do + -- Failed to parse diagram, return placeholder + return [Para [Str $ "[Diagram parse error: " <> err <> "]"]] + Right diagram -> + return $ diagramToBlocks diagram +shapeToBlocks _archive _rels (PptxGraphic text) = + -- Placeholder for other graphics (charts, etc.) + return [Para [Str $ "[Graphic: " <> text <> "]"]] + +-- | Resolve media path (handle relative paths) +resolveMediaPath :: Text -> Text +resolveMediaPath target = + if "../media/" `T.isPrefixOf` target + then "ppt/media/" <> T.drop 9 target -- "../media/" = 9 chars + else if "media/" `T.isPrefixOf` target + then "ppt/" <> target + else target + +-- | Load media file from archive +loadMediaFromArchive :: Archive -> Text -> Maybe B.ByteString +loadMediaFromArchive archive path = + case findEntryByPath (T.unpack path) archive of + Just entry -> Just $ fromEntry entry + Nothing -> Nothing + +-- | Parse paragraphs from text box +parseParagraphs :: NameSpaces -> Element -> [PptxParagraph] +parseParagraphs ns txBody = + let pElems = findChildrenByName ns "a" "p" txBody + in map (parseParagraph ns) pElems + +-- | Parse individual paragraph +parseParagraph :: NameSpaces -> Element -> PptxParagraph +parseParagraph ns pElem = + let level = parseBulletLevel ns pElem + bullet = detectBulletType ns pElem + text = extractParagraphText ns pElem + in PptxParagraph level bullet text + +-- | Parse bullet level from paragraph properties +parseBulletLevel :: NameSpaces -> Element -> Int +parseBulletLevel ns pElem = + case findChildByName ns "a" "pPr" pElem >>= + findAttr (unqual "lvl") >>= + (\s -> readMaybe (T.unpack s) :: Maybe Int) of + Just lvl -> lvl + Nothing -> 0 -- Default to level 0 + +-- | Detect bullet type +detectBulletType :: NameSpaces -> Element -> BulletType +detectBulletType ns pElem = + -- Check for explicit + case findChildByName ns "a" "pPr" pElem >>= + findChildByName ns "a" "buChar" of + Just _buCharElem -> Bullet + Nothing -> + -- Check for Wingdings symbol (common in PowerPoint) + if hasWingdingsSymbol ns pElem + then WingdingsBullet + else NoBullet + +-- | Check if paragraph starts with Wingdings symbol +hasWingdingsSymbol :: NameSpaces -> Element -> Bool +hasWingdingsSymbol ns pElem = + let runs = findChildrenByName ns "a" "r" pElem + checkRun r = case findChildByName ns "a" "rPr" r >>= + findChildByName ns "a" "sym" of + Just symElem -> + case findAttr (unqual "typeface") symElem of + Just typeface -> "Wingdings" `T.isInfixOf` typeface + Nothing -> False + Nothing -> False + in any checkRun runs + +-- | Extract text from paragraph +extractParagraphText :: NameSpaces -> Element -> Text +extractParagraphText _ns pElem = + -- Find all elements and concatenate + let textElems = filterElementsName (\qn -> qName qn == "t") pElem + texts = map strContent textElems + in T.unwords $ filter (not . T.null) texts + +-- | Extract text from DrawingML element (finds all descendants) +extractDrawingMLText :: Element -> Text +extractDrawingMLText el = + let textElems = filterElementsName (\qn -> qName qn == "t") el + texts = map strContent textElems + in T.unwords $ filter (not . T.null) texts + +-- | Convert paragraphs to blocks, grouping bullets into lists +paragraphsToBlocks :: [PptxParagraph] -> [Block] +paragraphsToBlocks paras = + -- If we have multiple paragraphs with bullets, group them + let hasBullets = any (\p -> paraBullet p /= NoBullet) paras + in if hasBullets + then groupBulletParagraphs paras + else map (\p -> Para [Str $ paraText p]) paras + +-- | Group bullet paragraphs into lists +groupBulletParagraphs :: [PptxParagraph] -> [Block] +groupBulletParagraphs paras = + let grouped = groupBy sameBulletLevel paras + in concatMap groupToBlock grouped + where + sameBulletLevel p1 p2 = + (paraBullet p1 /= NoBullet) && + (paraBullet p2 /= NoBullet) && + (paraLevel p1 == paraLevel p2) + + groupToBlock :: [PptxParagraph] -> [Block] + groupToBlock [] = [] + groupToBlock ps@(p:_) + | paraBullet p /= NoBullet = + -- Bullet list + let items = map (\para -> [Plain [Str $ paraText para]]) ps + in [BulletList items] + | otherwise = + -- Plain paragraph + map (\para -> Para [Str $ paraText para]) ps + +-- | Check if shape is title placeholder (also used in Slides module) +isTitlePlaceholder :: NameSpaces -> Element -> Bool +isTitlePlaceholder ns el = + case findChildByName ns "p" "nvSpPr" el >>= + findChildByName ns "p" "nvPr" >>= + findChildByName ns "p" "ph" of + Just phElem -> + case findAttr (unqual "type") phElem of + Just phType -> phType == "title" || phType == "ctrTitle" + Nothing -> False + Nothing -> False diff --git a/src/Text/Pandoc/Readers/Pptx/Slides.hs b/src/Text/Pandoc/Readers/Pptx/Slides.hs new file mode 100644 index 000000000000..5e24af26f4a3 --- /dev/null +++ b/src/Text/Pandoc/Readers/Pptx/Slides.hs @@ -0,0 +1,83 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Pptx.Slides + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +Conversion of PPTX slides to Pandoc AST blocks. +-} +module Text.Pandoc.Readers.Pptx.Slides + ( pptxToOutput + ) where + +import Codec.Archive.Zip (Archive) +import Data.List (find) +import Data.Maybe (mapMaybe) +import qualified Data.Text as T +import Data.Text (Text) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Readers.OOXML.Shared +import Text.Pandoc.Readers.Pptx.Parse +import Text.Pandoc.Readers.Pptx.Shapes +import Text.Pandoc.XML.Light + +-- | Convert Pptx intermediate representation to Pandoc AST +pptxToOutput :: PandocMonad m => ReaderOptions -> Pptx -> m (Meta, [Block]) +pptxToOutput _opts pptx = do + let slides = pptxSlides pptx + archive = pptxArchive pptx + + -- Convert each slide to blocks + slideBlocks <- concat <$> mapM (slideToBlocks archive) slides + + return (mempty, slideBlocks) + +-- | Convert slide to blocks +slideToBlocks :: PandocMonad m => Archive -> PptxSlide -> m [Block] +slideToBlocks archive slide = do + let SlideId n = slideId slide + slideElem = slideElement slide + rels = slideRels slide + ns = elemToNameSpaces slideElem + + -- Extract title from title placeholder + title = extractSlideTitle ns slideElem + + -- Create header + slideIdent = "slide-" <> T.pack (show n) + headerText = if T.null title + then "Slide " <> T.pack (show n) + else title + header = Header 2 (slideIdent, [], []) [Str headerText] + + -- Parse shapes and convert to blocks + case findChildByName ns "p" "cSld" slideElem >>= + findChildByName ns "p" "spTree" of + Nothing -> return [header] + Just spTree -> do + -- Filter out title placeholder shapes before parsing + let allShapeElems = onlyElems $ elContent spTree + nonTitleShapeElems = filter (not . isTitlePlaceholder ns) allShapeElems + shapes = mapMaybe (parseShape ns) nonTitleShapeElems + shapeBlocks <- concat <$> mapM (shapeToBlocks archive rels) shapes + return $ header : shapeBlocks + +-- | Extract title from title placeholder +extractSlideTitle :: NameSpaces -> Element -> Text +extractSlideTitle ns slideElem = + case findChildByName ns "p" "cSld" slideElem >>= + findChildByName ns "p" "spTree" of + Nothing -> "" + Just spTree -> + -- Find shape with ph type="title" + let shapes = onlyElems $ elContent spTree + titleShape = find (isTitlePlaceholder ns) shapes + in maybe "" extractDrawingMLText titleShape + +-- isTitlePlaceholder is imported from Shapes module diff --git a/src/Text/Pandoc/Readers/Pptx/SmartArt.hs b/src/Text/Pandoc/Readers/Pptx/SmartArt.hs new file mode 100644 index 000000000000..64e4a8649c03 --- /dev/null +++ b/src/Text/Pandoc/Readers/Pptx/SmartArt.hs @@ -0,0 +1,220 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Pptx.SmartArt + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +SmartArt diagram parsing and text extraction for PPTX. +-} +module Text.Pandoc.Readers.Pptx.SmartArt + ( PptxDiagram(..) + , parseDiagram + , diagramToBlocks + ) where + +import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry) +import qualified Data.Map.Strict as M +import Data.Maybe (mapMaybe) +import qualified Data.Text as T +import qualified Data.Text.Lazy.Encoding as TL +import Data.Text (Text) +import Text.Pandoc.Definition +import Text.Pandoc.Readers.OOXML.Shared +import Text.Pandoc.XML.Light + +-- | SmartArt diagram data +data PptxDiagram = PptxDiagram + { diagramType :: Text -- Layout type (chevron, cycle, etc.) + , diagramNodes :: [(Text, [Text])] -- (nodeText, childTexts) + } deriving (Show) + +-- | Parse SmartArt diagram from relationship IDs +parseDiagram :: Archive + -> [(Text, Text)] -- Slide relationships + -> Text -- data relationship ID + -> Text -- layout relationship ID + -> Either Text PptxDiagram +parseDiagram archive rels dataRelId layoutRelId = do + -- Resolve relationships to file paths + dataTarget <- maybeToEither ("Relationship not found: " <> dataRelId) $ + lookup dataRelId rels + layoutTarget <- maybeToEither ("Relationship not found: " <> layoutRelId) $ + lookup layoutRelId rels + + -- Resolve relative paths (diagrams are in ../diagrams/ from slides/) + let dataPath = resolveDiagramPath dataTarget + layoutPath = resolveDiagramPath layoutTarget + + -- Load XML files + dataElem <- loadXMLFromArchive archive dataPath + layoutElem <- loadXMLFromArchive archive layoutPath + + -- Extract layout type + layoutType <- extractLayoutType layoutElem + + -- Extract text nodes with hierarchy + nodes <- extractDiagramNodes dataElem + + return $ PptxDiagram layoutType nodes + +-- | Resolve diagram path (handle ../diagrams/ relative paths) +resolveDiagramPath :: Text -> FilePath +resolveDiagramPath target = + if "../diagrams/" `T.isPrefixOf` target + then "ppt/diagrams/" ++ T.unpack (T.drop 12 target) -- "../diagrams/" = 12 chars + else T.unpack target + +-- | Load XML from archive +loadXMLFromArchive :: Archive -> FilePath -> Either Text Element +loadXMLFromArchive archive path = + case findEntryByPath path archive of + Nothing -> Left $ "File not found in archive: " <> T.pack path + Just entry -> + let xmlBytes = fromEntry entry + lazyText = TL.decodeUtf8 xmlBytes + in parseXMLElement lazyText + +-- | Extract layout type from layout XML +extractLayoutType :: Element -> Either Text Text +extractLayoutType layoutElem = do + -- Look for uniqueId attribute: "urn:.../layout/chevron2" + case findAttr (unqual "uniqueId") layoutElem of + Just uid -> + -- Extract last part after last / + let layoutName = T.takeWhileEnd (/= '/') uid + in Right layoutName + Nothing -> + -- Fallback: look for title + case findChildByName ns "dgm" "title" layoutElem >>= + findAttr (unqual "val") of + Just title -> Right title + Nothing -> Right "unknown" + where + ns = elemToNameSpaces layoutElem + +-- | Extract text nodes from diagram data +extractDiagramNodes :: Element -> Either Text [(Text, [Text])] +extractDiagramNodes dataElem = do + let ns = elemToNameSpaces dataElem + + -- Find point list + ptLst <- maybeToEither "Missing dgm:ptLst" $ + findChildByName ns "dgm" "ptLst" dataElem + + let ptElems = findChildrenByName ns "dgm" "pt" ptLst + + -- Build node map: modelId → text + let nodeMap = M.fromList $ mapMaybe (extractNodeText ns) ptElems + + -- Parse connections + let cxnLst = findChildByName ns "dgm" "cxnLst" dataElem + connections = maybe [] (parseConnections ns) cxnLst + + -- Build parent-child map + let parentMap = buildParentMap connections + + -- Find parent nodes (nodes that have children) + let parentIds = M.keys parentMap + + -- Build hierarchy - only show nodes that are parents + -- (children are shown under their parents) + let hierarchy = map (buildNodeWithChildren nodeMap parentMap) parentIds + -- Filter out nodes with empty text (presentation nodes) + validHierarchy = filter (\(nodeText, _) -> not $ T.null nodeText) hierarchy + + return validHierarchy + +-- | Extract text from a point element (returns Nothing if no text) +extractNodeText :: NameSpaces -> Element -> Maybe (Text, Text) +extractNodeText ns ptElem = do + modelId <- findAttr (unqual "modelId") ptElem + + -- Extract text from dgm:t element (which contains a:p/a:r/a:t) + let text = case findChildByName ns "dgm" "t" ptElem of + Just tElem -> + -- Recursively get ALL text content from all descendants + getAllText tElem + Nothing -> "" + + -- Only return nodes with actual text + if T.null (T.strip text) + then Nothing + else return (modelId, text) + +-- | Connection between nodes +data Connection = Connection + { connType :: Text + , connSrc :: Text + , connDest :: Text + } deriving (Show) + +-- | Parse connections +parseConnections :: NameSpaces -> Element -> [Connection] +parseConnections ns cxnLst = + let cxnElems = findChildrenByName ns "dgm" "cxn" cxnLst + in mapMaybe (parseConnection ns) cxnElems + +parseConnection :: NameSpaces -> Element -> Maybe Connection +parseConnection _ns cxnElem = do + let cxnType = maybe "" id $ findAttr (unqual "type") cxnElem -- Empty if no type + srcId <- findAttr (unqual "srcId") cxnElem + destId <- findAttr (unqual "destId") cxnElem + return $ Connection cxnType srcId destId + +-- | Build parent-child map from connections +-- Use connections WITHOUT a type attribute (these are the data hierarchy) +buildParentMap :: [Connection] -> M.Map Text [Text] +buildParentMap connections = + let dataConnections = filter (\c -> T.null (connType c)) connections + in foldr addConn M.empty dataConnections + where + addConn conn m = M.insertWith (++) (connSrc conn) [connDest conn] m + +-- | Build node with its children +buildNodeWithChildren :: M.Map Text Text -> M.Map Text [Text] -> Text -> (Text, [Text]) +buildNodeWithChildren nodeMap parentMap nodeId = + let nodeText = M.findWithDefault "" nodeId nodeMap + childIds = M.findWithDefault [] nodeId parentMap + -- Only include children that have text + childTexts = filter (not . T.null) $ + map (\cid -> M.findWithDefault "" cid nodeMap) childIds + in (nodeText, childTexts) + +-- | Convert diagram to Pandoc blocks +diagramToBlocks :: PptxDiagram -> [Block] +diagramToBlocks diagram = + let nodes = diagramNodes diagram + layoutType = diagramType diagram + + -- Build content blocks + contentBlocks = concatMap nodeToBlocks nodes + + in [Div ("", ["smartart", layoutType], [("layout", layoutType)]) + contentBlocks] + +-- | Convert node to blocks +nodeToBlocks :: (Text, [Text]) -> [Block] +nodeToBlocks (nodeText, childTexts) = + if null childTexts + then [Para [Strong [Str nodeText]]] + else [ Para [Strong [Str nodeText]] + , BulletList [[Plain [Str child]] | child <- childTexts] + ] + +-- | Recursively extract all text from an element and its descendants +getAllText :: Element -> Text +getAllText el = + let textFromContent (Text cdata) = cdData cdata + textFromContent (Elem e) = getAllText e + textFromContent _ = "" + texts = map textFromContent (elContent el) + in T.unwords $ filter (not . T.null) texts + +-- Helper functions +maybeToEither :: Text -> Maybe a -> Either Text a +maybeToEither err Nothing = Left err +maybeToEither _ (Just x) = Right x diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs new file mode 100644 index 000000000000..613d5b50fd17 --- /dev/null +++ b/test/Tests/Readers/Pptx.hs @@ -0,0 +1,63 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Tests.Readers.Pptx + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic + Stability : alpha + Portability : portable + +Tests for the PPTX reader. +-} +module Tests.Readers.Pptx (tests) where + +import Data.Algorithm.Diff (getDiff) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import qualified Data.Text as T +import Test.Tasty +import Test.Tasty.Golden.Advanced +import Tests.Helpers +import Text.Pandoc +import Text.Pandoc.UTF8 as UTF8 + +defopts :: ReaderOptions +defopts = def{ readerExtensions = getDefaultExtensions "pptx" } + +testCompare :: String -> FilePath -> FilePath -> TestTree +testCompare = testCompareWithOpts defopts + +nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String) +nativeDiff normPath expectedNative actualNative + | expectedNative == actualNative = return Nothing + | otherwise = Just <$> do + expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative) + actual <- T.unpack <$> runIOorExplode (writeNative def actualNative) + let dash = replicate 72 '-' + let diff = getDiff (lines actual) (lines expected) + return $ '\n' : dash ++ + "\n--- " ++ normPath ++ + "\n+++ " ++ "test" ++ "\n" ++ + showDiff (1,1) diff ++ dash + +testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree +testCompareWithOpts opts testName pptxFP nativeFP = + goldenTest + testName + (do nf <- UTF8.toText <$> BS.readFile nativeFP + runIOorExplode (readNative def nf)) + (do df <- B.readFile pptxFP + runIOorExplode (readPptx opts df)) + (nativeDiff nativeFP) + (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a) + >>= BS.writeFile nativeFP . UTF8.fromText) + +tests :: [TestTree] +tests = [ testGroup "basic" + [ testCompare + "text extraction" + "pptx-reader/basic.pptx" + "pptx-reader/basic.native" + ] + ] diff --git a/test/pptx-reader/basic.native b/test/pptx-reader/basic.native new file mode 100644 index 000000000000..954cb93458f6 --- /dev/null +++ b/test/pptx-reader/basic.native @@ -0,0 +1,149 @@ +[ Header 2 ( "slide-1" , [] , [] ) [ Str "LLMs" ] +, BulletList + [ [ Plain + [ Str + "Provider \61664 Available LLMs \8211 who manages? How?" + ] + ] + , [ Plain + [ Str + "EW maintained list of \8220approved\8221 LLMs for Universal workers" + ] + ] + , [ Plain + [ Str + "Rebuilding of UWs to the \8220Newgen\8221 thing completely" + ] + ] + , [ Plain [ Str "Streaming support" ] ] + , [ Plain [ Str "Multimodal (voice streaming) models?" ] ] + ] +, Header + 2 + ( "slide-2" , [] , [] ) + [ Str "Everworker venn diagram" ] +, Para [ Str "SKILLS" ] +, Para [ Str "" ] +, Para [ Str "Specialized Workers / Workflows:" ] +, Para [ Str "" ] +, Para [ Str "n8n, UI Path, " ] +, Para [ Str "other RPA" ] +, Para [ Str "BRAINS" ] +, Para [ Str "" ] +, Para [ Str "Universal Workers / AI Agents:" ] +, Para [ Str "" ] +, Para [ Str "openai , anthropic," ] +, Para [ Str "Crew AI, other " ] +, Para [ Str "\8220AI natives\8221" ] +, Para [ Str "KNOWLEDGE " ] +, Para [ Str "" ] +, Para [ Str "Data / " ] +, Para [ Str "RAG Pipelines" ] +, Para [ Str "" ] +, Para + [ Str "Vector DBs, specialized data prep vendors, \8230" ] +, Para [ Str "glean" ] +, Para [ Str "EW" ] +, Header 2 ( "slide-3" , [] , [] ) [ Str "Table" ] +, Table + ( "" , [] , [] ) + (Caption Nothing []) + [ ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + ] + (TableHead + ( "" , [] , [] ) + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col1" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col2" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col3" ] ] + ] + ]) + [ TableBody + ( "" , [] , [] ) + (RowHeadColumns 0) + [] + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Name" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Anton" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Antich" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Age" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "years" ] ] + ] + ] + ] + (TableFoot ( "" , [] , [] ) []) +, Para + [ Image + ( "" , [] , [] ) [] ( "ppt/media/image1.png" , "Picture 6" ) + ] +, Header 2 ( "slide-4" , [] , [] ) [ Str "Smart Art" ] +, Div + ( "" + , [ "smartart" , "chevron2" ] + , [ ( "layout" , "chevron2" ) ] + ) + [ Para [ Strong [ Str "First" ] ] + , BulletList + [ [ Plain [ Str "another" ] ] + , [ Plain [ Str "subtitle" ] ] + ] + , Para [ Strong [ Str "Second" ] ] + , BulletList + [ [ Plain [ Str "and yet again" ] ] + , [ Plain [ Str "yet more" ] ] + ] + ] +] diff --git a/test/pptx-reader/basic.pptx b/test/pptx-reader/basic.pptx new file mode 100644 index 000000000000..44caef9c363a Binary files /dev/null and b/test/pptx-reader/basic.pptx differ diff --git a/test/pptx/basic.native b/test/pptx/basic.native new file mode 100644 index 000000000000..60b5142a95d9 --- /dev/null +++ b/test/pptx/basic.native @@ -0,0 +1,150 @@ +[ Header 2 ( "slide-1" , [] , [] ) [ Str "Slide 1" ] +, Para [ Str "LLMs" ] +, BulletList + [ [ Plain + [ Str + "Provider \61664 Available LLMs \8211 who manages? How?" + ] + ] + , [ Plain + [ Str + "EW maintained list of \8220approved\8221 LLMs for Universal workers" + ] + ] + , [ Plain + [ Str + "Rebuilding of UWs to the \8220Newgen\8221 thing completely" + ] + ] + , [ Plain [ Str "Streaming support" ] ] + , [ Plain [ Str "Multimodal (voice streaming) models?" ] ] + ] +, Header 2 ( "slide-2" , [] , [] ) [ Str "Slide 2" ] +, Para [ Str "Everworker venn diagram" ] +, Para [ Str "SKILLS" ] +, Para [ Str "" ] +, Para [ Str "Specialized Workers / Workflows:" ] +, Para [ Str "" ] +, Para [ Str "n8n, UI Path, " ] +, Para [ Str "other RPA" ] +, Para [ Str "BRAINS" ] +, Para [ Str "" ] +, Para [ Str "Universal Workers / AI Agents:" ] +, Para [ Str "" ] +, Para [ Str "openai , anthropic," ] +, Para [ Str "Crew AI, other " ] +, Para [ Str "\8220AI natives\8221" ] +, Para [ Str "KNOWLEDGE " ] +, Para [ Str "" ] +, Para [ Str "Data / " ] +, Para [ Str "RAG Pipelines" ] +, Para [ Str "" ] +, Para + [ Str "Vector DBs, specialized data prep vendors, \8230" ] +, Para [ Str "glean" ] +, Para [ Str "EW" ] +, Header 2 ( "slide-3" , [] , [] ) [ Str "Slide 3" ] +, Para [ Str "Table" ] +, Table + ( "" , [] , [] ) + (Caption Nothing []) + [ ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + ] + (TableHead + ( "" , [] , [] ) + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col1" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col2" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col3" ] ] + ] + ]) + [ TableBody + ( "" , [] , [] ) + (RowHeadColumns 0) + [] + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Name" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Anton" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Antich" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Age" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "years" ] ] + ] + ] + ] + (TableFoot ( "" , [] , [] ) []) +, Para + [ Image + ( "" , [] , [] ) [] ( "ppt/media/image1.png" , "Picture 6" ) + ] +, Header 2 ( "slide-4" , [] , [] ) [ Str "Slide 4" ] +, Para [ Str "Smart Art" ] +, Div + ( "" + , [ "smartart" , "chevron2" ] + , [ ( "layout" , "chevron2" ) ] + ) + [ Para [ Strong [ Str "First" ] ] + , BulletList + [ [ Plain [ Str "another" ] ] + , [ Plain [ Str "subtitle" ] ] + ] + , Para [ Strong [ Str "Second" ] ] + , BulletList + [ [ Plain [ Str "and yet again" ] ] + , [ Plain [ Str "yet more" ] ] + ] + ] +] diff --git a/test/pptx/basic.pptx b/test/pptx/basic.pptx new file mode 100644 index 000000000000..44caef9c363a Binary files /dev/null and b/test/pptx/basic.pptx differ diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index 80d4ada7f6d7..0d04b361f1c6 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -12,6 +12,7 @@ import qualified Tests.Command import qualified Tests.Old import qualified Tests.Readers.Creole import qualified Tests.Readers.Docx +import qualified Tests.Readers.Pptx import qualified Tests.Readers.DokuWiki import qualified Tests.Readers.EPUB import qualified Tests.Readers.FB2 @@ -95,6 +96,7 @@ tests pandocPath = testGroup "pandoc tests" , testGroup "RST" Tests.Readers.RST.tests , testGroup "RTF" Tests.Readers.RTF.tests , testGroup "Docx" Tests.Readers.Docx.tests + , testGroup "Pptx" Tests.Readers.Pptx.tests , testGroup "ODT" Tests.Readers.ODT.tests , testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests , testGroup "EPUB" Tests.Readers.EPUB.tests