IEWebArchive
============

* :download:`Download example <PyObjCExample-IEWebArchive.zip>`

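A document-based viewer for Internet Explorer web archives (``.mht`` files):
each archive is converted to a WebKit ``WebArchive`` and shown in a web view.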


.. rst-class:: tabber

Sources
-------

.. rst-class:: tabbertab

MHTDocument.py
..............

.. sourcecode:: python

    import objc
    from Cocoa import NSDocument
    from loader import MHTLoader
    
    
    class MHTDocument(NSDocument):
        """
        Document class that shows the contents of an .mht archive in a web view.

        The archive is parsed by ``MHTLoader``, converted to a ``WebArchive``
        and loaded into the main frame of ``webview``.
        """

        # Interface Builder outlets (presumably connected in the "MHTDocument"
        # nib named by windowNibName -- confirm in the nib).
        locationbox = objc.IBOutlet()
        webview = objc.IBOutlet()

        # Path stored by readFromFile_ofType_ while the web view does not
        # exist yet; consumed by windowControllerDidLoadNib_.
        path = None
        # Not referenced anywhere else in this file.
        statusText = None

        @objc.IBAction
        def navigateHistory_(self, sender):
            """
            Go back when segment 0 of *sender* is selected, forward otherwise.
            """
            if sender.selectedSegment() == 0:
                self.webview.goBack_(sender)
            else:
                self.webview.goForward_(sender)

        def windowNibName(self):
            """
            Return the name of the nib file for this document's window.
            """
            return "MHTDocument"

        def readFromFile_ofType_(self, path, tp):
            """
            Open the archive at *path*; *tp* is ignored.

            When the web view outlet is still unset the path is only
            remembered, and the actual load happens in
            windowControllerDidLoadNib_. Always returns True.
            """
            if self.webview is None:
                self.path = path
            else:
                self.readMHT_(path)

            return True

        def writeToFile_ofType_(self, path, tp):
            """
            Saving is not implemented; always returns False.
            """
            # TODO: "save-as" functionality
            return False

        def windowControllerDidLoadNib_(self, controller):
            """
            Load the archive whose path was deferred by readFromFile_ofType_.
            """
            if self.path:
                self.readMHT_(self.path)

        def readMHT_(self, path):
            """
            Parse the .mht file at *path* and display it in the web view.

            The (fixed-up) URL of the archive's root part is shown in the
            location box.
            """
            self.mht = MHTLoader(path)
            self.locationbox.setStringValue_(self.mht.fixupURL(self.mht.root))
            archive = self.mht.asWebArchive()

            # Debugging aids: log the archive and keep a copy of the converted
            # data at a fixed location.
            print("Archive", archive.description())
            with open("/tmp/archive.webarchive", "wb") as fp:
                fp.write(archive.data().bytes())

            # Cancel whatever the main frame is loading before handing it the
            # new archive.
            self.webview.mainFrame().stopLoading()
            self.webview.mainFrame().loadArchive_(archive)

.. rst-class:: tabbertab

loader.py
.........

.. sourcecode:: python

    import email
    
    from Cocoa import NSURL, NSData, NSString
    from WebKit import WebArchive, WebResource
    
    
    # def loadMHT(filename):
    #     """
    #     Load a .MHT HTML archive and return the WebArchive representation.
    #     """
    #     return MHTLoader(filename).asWebArchive()
    
    
    class MHTLoader:
        """
        A loader for .mht files, an archive format used by MS Internet Explorer
        on Windows.

        The file is parsed as a MIME message. Every non-multipart part that has
        a ``Content-Location`` header is stored in ``self.parts`` under that
        location; the first such part becomes ``self.root``.
        """

        def __init__(self, filename):
            """
            Parse the archive stored in the file named *filename*.
            """
            self.filename = filename

            # root of the archive (key into self.parts), None if nothing usable
            # was found
            self.root = None

            # Content-Location -> (content-type, data)
            self.parts = {}

            self.loadFile(filename)

        def loadFile(self, filename):
            """
            Read *filename* and register its parts in ``self.parts``.

            Sets ``self.root`` to the location of the first registered part if
            no root has been chosen yet.
            """
            # Parse from bytes: parts may be 8bit/binary data in an arbitrary
            # charset, which cannot be decoded reliably in text mode.
            with open(filename, "rb") as fp:
                msg = email.message_from_binary_file(fp)

            for part in msg.walk():
                if part.get_content_maintype() == "multipart":
                    # Containers carry no payload of their own.
                    continue

                location = part.get("Content-Location")
                if location is None:
                    # Without a location there is no URL to register the part
                    # under (and fixupURL/NSURL could not handle it later).
                    continue

                # Headers with raw 8-bit bytes come back as Header objects;
                # normalise to a plain string.
                location = str(location).strip()

                contentType = part.get_content_type()
                data = part.get_payload(decode=True)

                self.parts[location] = (contentType, data)
                if self.root is None:
                    self.root = location

        def fixupURL(self, url):
            """
            Return *url* with backslashes replaced by slashes for file: URLs.

            Other URLs are returned unchanged.
            """
            # IE creates MHT files with file: URLS containing backslashes,
            # NSURL insists that those are invalid, replace backslashes by
            # forward slashes.
            if url.startswith("file:"):
                return url.replace("\\", "/")
            else:
                return url

        def _makeResource(self, url, contentType, data):
            """
            Wrap one archive part (bytes *data* at *url*) in a WebResource.
            """
            return WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
                NSData.dataWithBytes_length_(data, len(data)),
                NSURL.URLWithString_(self.fixupURL(url)),
                NSString.stringWithString_(contentType),
                None,
                None,
            )

        def asWebArchive(self):
            """
            Convert the MHT archive to a webarchive.

            The root part becomes the main resource, all other parts become
            subresources. Raises ValueError when the archive has no root part.
            """
            if self.root is None:
                raise ValueError(
                    "%r contains no part with a Content-Location header"
                    % (self.filename,)
                )

            rootType, rootText = self.parts[self.root]

            # Backslashes in the root document are replaced by slashes, in
            # line with fixupURL (byte-for-byte, so the length is unchanged).
            pageResource = self._makeResource(
                self.root, rootType, rootText.replace(b"\\", b"/")
            )

            resources = [
                self._makeResource(url, tp, data)
                for url, (tp, data) in self.parts.items()
                if url != self.root
            ]

            return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
                pageResource, resources, None
            )
    
    
    def main():
        """
        Ad-hoc test: convert ``python-home.mht`` in the current directory and
        write the result to ``python-home.webarchive``.
        """
        archive = MHTLoader("python-home.mht").asWebArchive()
        with open("python-home.webarchive", "wb") as out:
            out.write(archive.data().bytes())


    # Only run the conversion when this file is executed as a script.
    if __name__ == "__main__":
        main()

.. rst-class:: tabbertab

main.py
.......

.. sourcecode:: python

    # Imported only for its side effect (the name is unused, hence the noqa);
    # presumably this makes the MHTDocument class named by NSDocumentClass in
    # setup.py available to Cocoa -- confirm.
    import MHTDocument  # noqa: F401
    import objc
    from PyObjCTools import AppHelper
    
    # Switch on PyObjC's verbose mode (presumably extra diagnostics -- confirm).
    objc.setVerbose(1)
    
    # Hand control to the event loop provided by PyObjCTools.AppHelper.
    AppHelper.runEventLoop()

.. rst-class:: tabbertab

setup.py
........

.. sourcecode:: python

    """
    Script for building the example.
    
    Usage:
        python3 setup.py py2app
    """
    
    from setuptools import setup
    
    # The one document type the application declares: files with the "mht"
    # extension, handled by the MHTDocument class.
    mht_document_type = {
        "CFBundleTypeExtensions": ["mht"],
        "CFBundleTypeName": "Internet Explorer Web Archive",
        "CFBundleTypeRole": "Editor",
        "NSDocumentClass": "MHTDocument",
    }

    # Info.plist entries passed to py2app.
    plist = {"CFBundleDocumentTypes": [mht_document_type]}

    py2app_options = {"plist": plist}

    setup(
        name="MHTViewer",
        app=["main.py"],
        data_files=["MainMenu.nib", "MHTDocument.nib"],
        options={"py2app": py2app_options},
        setup_requires=["py2app", "pyobjc-framework-Cocoa", "pyobjc-framework-WebKit"],
    )

