Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANUAL.txt
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ header when requesting a document from a URL:
- `opml` ([OPML])
- `org` ([Emacs Org mode])
- `pod` (Perl's [Plain Old Documentation])
- `pptx` ([PowerPoint])
- `ris` ([RIS] bibliography)
- `rtf` ([Rich Text Format])
- `rst` ([reStructuredText])
Expand Down
7 changes: 7 additions & 0 deletions pandoc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,7 @@ library
Text.Pandoc.Readers.TikiWiki,
Text.Pandoc.Readers.Txt2Tags,
Text.Pandoc.Readers.Docx,
Text.Pandoc.Readers.Pptx,
Text.Pandoc.Readers.ODT,
Text.Pandoc.Readers.EPUB,
Text.Pandoc.Readers.Muse,
Expand Down Expand Up @@ -718,6 +719,11 @@ library
Text.Pandoc.Readers.Docx.Util,
Text.Pandoc.Readers.Docx.Symbols,
Text.Pandoc.Readers.Docx.Fields,
Text.Pandoc.Readers.OOXML.Shared,
Text.Pandoc.Readers.Pptx.Parse,
Text.Pandoc.Readers.Pptx.Shapes,
Text.Pandoc.Readers.Pptx.Slides,
Text.Pandoc.Readers.Pptx.SmartArt,
Text.Pandoc.Readers.HTML.Parsing,
Text.Pandoc.Readers.HTML.Table,
Text.Pandoc.Readers.HTML.TagCategories,
Expand Down Expand Up @@ -854,6 +860,7 @@ test-suite test-pandoc
Tests.Readers.RST
Tests.Readers.RTF
Tests.Readers.Docx
Tests.Readers.Pptx
Tests.Readers.ODT
Tests.Readers.Txt2Tags
Tests.Readers.EPUB
Expand Down
3 changes: 3 additions & 0 deletions src/Text/Pandoc/Readers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ module Text.Pandoc.Readers
Reader (..)
, readers
, readDocx
, readPptx
, readODT
, readMarkdown
, readCommonMark
Expand Down Expand Up @@ -87,6 +88,7 @@ import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
import Text.Pandoc.Readers.Pptx
import Text.Pandoc.Readers.DokuWiki
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.FB2
Expand Down Expand Up @@ -157,6 +159,7 @@ readers = [("native" , TextReader readNative)
,("twiki" , TextReader readTWiki)
,("tikiwiki" , TextReader readTikiWiki)
,("docx" , ByteStringReader readDocx)
,("pptx" , ByteStringReader readPptx)
,("odt" , ByteStringReader readODT)
,("t2t" , TextReader readTxt2Tags)
,("epub" , ByteStringReader readEPUB)
Expand Down
48 changes: 4 additions & 44 deletions src/Text/Pandoc/Readers/Docx/Util.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,51 +24,11 @@ module Text.Pandoc.Readers.Docx.Util (
, extractChildren
) where

import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.XML.Light
import qualified Data.Map as M
import Data.List (partition)

type NameSpaces = M.Map Text Text

elemToNameSpaces :: Element -> NameSpaces
elemToNameSpaces = foldr (\(Attr qn val) ->
case qn of
QName s _ (Just "xmlns") -> M.insert s val
_ -> id) mempty . elAttribs

elemName :: NameSpaces -> Text -> Text -> QName
elemName ns prefix name =
QName name (M.lookup prefix ns)
(if T.null prefix then Nothing else Just prefix)

isElem :: NameSpaces -> Text -> Text -> Element -> Bool
isElem ns prefix name element =
let ns' = ns <> elemToNameSpaces element
in qName (elName element) == name &&
qURI (elName element) == M.lookup prefix ns'

findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findChildByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChild (elemName ns' pref name) el

findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
findChildrenByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChildren (elemName ns' pref name) el

-- | Like 'findChildrenByName', but searches descendants.
findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findElementByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findElement (elemName ns' pref name) el

findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findAttr (elemName ns' pref name) el
import Text.Pandoc.XML.Light
import Text.Pandoc.Readers.OOXML.Shared
(NameSpaces, elemName, isElem, elemToNameSpaces,
findChildByName, findChildrenByName, findElementByName, findAttrByName)


-- | Removes child elements that satisfy a given condition.
Expand Down
95 changes: 95 additions & 0 deletions src/Text/Pandoc/Readers/OOXML/Shared.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.OOXML.Shared
Copyright : © 2025 Anton Antic
License : GNU GPL, version 2 or above

Maintainer : Anton Antic <[email protected]>
Stability : alpha
Portability : portable

Shared utilities for Office Open XML (OOXML) readers (DOCX, PPTX).
Provides common functions for ZIP archive handling, XML parsing,
namespace management, and DrawingML parsing.
-}
module Text.Pandoc.Readers.OOXML.Shared
( -- * Constants
emusPerInch
, emuToInches
, inchesToEmu
-- * Types
, NameSpaces
, elemName
, elemToNameSpaces
, isElem
, findChildByName
, findChildrenByName
, findElementByName
, findAttrByName
) where

import qualified Data.Map as M
import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.XML.Light

-- | Type alias for namespace mappings
type NameSpaces = M.Map Text Text

-- | English Metric Units per inch
-- 1 inch = 914400 EMUs (used in OOXML for dimensions)
emusPerInch :: Integer
emusPerInch = 914400

-- | Convert EMUs to inches
emuToInches :: Integer -> Double
emuToInches n = fromIntegral n / fromIntegral emusPerInch

-- | Convert inches to EMUs
inchesToEmu :: Double -> Integer
inchesToEmu n = round (n * fromIntegral emusPerInch)

-- | Extract namespace declarations from element attributes
elemToNameSpaces :: Element -> NameSpaces
elemToNameSpaces = foldr (\(Attr qn val) ->
case qn of
QName s _ (Just "xmlns") -> M.insert s val
_ -> id) mempty . elAttribs

-- | Create a qualified name from namespace map, prefix, and local name
elemName :: NameSpaces -> Text -> Text -> QName
elemName ns prefix name =
QName name
(M.lookup prefix ns)
(if T.null prefix then Nothing else Just prefix)

-- | Check if element matches namespace prefix and local name
isElem :: NameSpaces -> Text -> Text -> Element -> Bool
isElem ns prefix name element =
let ns' = ns <> elemToNameSpaces element
in qName (elName element) == name &&
qURI (elName element) == M.lookup prefix ns'

-- | Find first child element matching namespace and name
findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findChildByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChild (elemName ns' pref name) el

-- | Find all children matching namespace and name
findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
findChildrenByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChildren (elemName ns' pref name) el

-- | Find element anywhere in descendants matching namespace and name
findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findElementByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findElement (elemName ns' pref name) el

-- | Find attribute value by namespace prefix and name
findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findAttr (elemName ns' pref name) el
43 changes: 43 additions & 0 deletions src/Text/Pandoc/Readers/Pptx.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.Pptx
Copyright : © 2025 Anton Antic
License : GNU GPL, version 2 or above

Maintainer : Anton Antic <[email protected]>
Stability : alpha
Portability : portable

Conversion of PPTX (PowerPoint) documents to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Pptx (readPptx) where

import qualified Data.ByteString.Lazy as B
import qualified Data.Text as T
import Codec.Archive.Zip (toArchiveOrFail)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Definition (Pandoc(..))
import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Options (ReaderOptions)
import Text.Pandoc.Readers.Pptx.Parse (archiveToPptx)
import Text.Pandoc.Readers.Pptx.Slides (pptxToOutput)

-- | Read PPTX file into Pandoc AST
readPptx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
readPptx opts bytes =
case toArchiveOrFail bytes of
Right archive ->
case archiveToPptx archive of
Right pptx -> do
-- Convert Pptx intermediate to Pandoc AST
(meta, blocks) <- pptxToOutput opts pptx
return $ Pandoc meta blocks

Left err ->
throwError $ PandocParseError $
"Failed to parse PPTX: " <> err

Left err ->
throwError $ PandocParseError $
"Failed to unpack PPTX archive: " <> T.pack err
Loading
Loading