Browse Source

initial release

ali 4 years ago
parent
commit
9625e5e123

+ 156 - 0
.gitignore

@@ -0,0 +1,156 @@
+
+# Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks
+
+### JupyterNotebooks ###
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+#   git rm -r .ipynb_checkpoints/
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+pytestdebug.log
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+doc/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+
+# IPython
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pythonenv*
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# profiling data
+.prof
+
+# End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks

+ 1 - 0
MANIFEST.in

@@ -0,0 +1 @@
+include data/*

+ 29 - 1
README.md

@@ -1,3 +1,31 @@
 # doc2python
 
-Native python tool for text extraction from doc binary file.
+A tool that extracts text from binary Word `.doc` files.
+
+## How to install it
+
+
+Clone this repository:
+```
+    git clone https://gogs.alibellamine.me/alibell/doc2python
+```
+
+Install dependencies with pip.
+
+```
+    pip install -r requirements.txt
+```
+
+Then install the library in editable mode:
+
+```
+    pip install -e .
+```
+
+## How to use it
+
+```
+    from doc2python import process
+
+    text = process(path_to_doc)
+```

+ 768 - 0
data_creation/Create_Dict.ipynb

@@ -0,0 +1,768 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from collections import OrderedDict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fib = OrderedDict({\n",
+    "    \"wIdent\":[\"An unsigned integer that specifies that this is a Word Binary File. This value MUST be 0xA5EC\",2],\n",
+    "    \"nFib\":[\"An unsigned integer that specifies the version number of the file format used. Superseded by FibRgCswNew.nFibNewif it is present. This value SHOULD<12>be 0x00C1.\", 2],\n",
+    "    \"ununsed\":[\"This value is undefined and MUST be ignore\", 2],\n",
+    "    \"lid\":[\"A LIDthat specifies the install language of the application that is producing the document. If nFibis 0x00D9 or greater, then any East Asian install lid or any install lid with a base language of Spanish, German or French MUST be recorded as lidAmerican. If the nFibis 0x0101 or greater, then any install lid with a base language of Vietnamese, Thai, or Hindi MUST be recorded as lidAmerican.\",2],\n",
+    "    \"pnNext\":[\"An unsigned integer that specifies the offset in the WordDocument streamof the FIBfor the document which contains all the AutoTextitems. If this value is 0, there are no AutoText items attached. Otherwise the FIBis found at file location pnNext×512. If fGlsyis 1 or fDotis 0, this value MUST be 0. If pnNextis not 0, each FIBMUST share the same values for FibRgFcLcb97.fcPlcBteChpx, FibRgFcLcb97.lcbPlcBteChpx, FibRgFcLcb97.fcPlcBtePapx, FibRgFcLcb97.lcbPlcBtePapx, andFibRgLw97.cbMac.\", 2],\n",
+    "    \"flag1\":[\"Flag\", 2, \"bits\", [\"fDot\",\"fGlsy\",\"fComplex\",\"fHasPic\",\"cQuickSaves\",\"fEncrypted\",\"fWhichTblStm\",\"fReadOnlyRecommended\",\"fWriteReservation\",\"fExtChar\",\"fLoadOverride\",\"fFarEast\",\"fObfuscated\"], [1,1,1,1,4,1,1,1,1,1,1,1,1]],\n",
+    "    \"nFibBack\":[\"This value SHOULD<14>be 0x00BF. This value MUST be 0x00BF or 0x00C\", 2],\n",
+    "    \"lKey\":[\" If fEncryptedis 1 and fObfuscationis 1, this value specifies the XOR obfuscation (section 2.2.6.1) password verifier. If fEncryptedis 1 and fObfuscationis 0, this value specifies the size of the EncryptionHeaderthat is stored at the beginning of the Table stream as described in Encryption and Obfuscation. Otherwise, this value MUST be 0\", 4],\n",
+    "    \"envr\":[\"This value MUST be 0, and MUST be ignored\", 1],\n",
+    "    \"flag2\":[\"Flag\", 1, \"bits\", [\"fMac\",\"fEmptySpecial\",\"fLoadOverridePage\",\"reserved1\",\"reserved2\",\"fSpare0\"], [1,1,1,1,1,3]],\n",
+    "    \"reserved3\":[\"This value MUST be 0 and MUST be ignore\", 2],\n",
+    "    \"reserved4\":[\"This value MUST be 0 and MUST be ignore\", 2],\n",
+    "    \"reserved5\":[\"This value is undefined and MUST be ignore\", 4],\n",
+    "    \"reserved6\":[\"This value is undefined and MUST be ignore\", 4]\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../doc2python/data/FibBase.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(fib))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clw = {\n",
+    "    \"clw\":[\"\",2]\n",
+    "}\n",
+    "\n",
+    "with open(\"../doc2python/data/clw.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(clw))\n",
+    "    \n",
+    "cslw = {\n",
+    "    \"cslw\":[\"\",2]\n",
+    "}\n",
+    "\n",
+    "with open(\"../doc2python/data/cslw.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(cslw))\n",
+    "    \n",
+    "cbRgFcLcb = {\n",
+    "    \"cbRgFcLcb\":[\"\",2]\n",
+    "}\n",
+    "\n",
+    "with open(\"../doc2python/data/cbRgFcLcb.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(cbRgFcLcb))\n",
+    "\n",
+    "cswNew = {\n",
+    "    \"cswNew\":[\"\",2]\n",
+    "}\n",
+    "\n",
+    "with open(\"../doc2python/data/cswNew.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(cswNew))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FibRgW97"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FibRgW97 = OrderedDict({\n",
+    "    \"reserved1\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved2\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved3\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved4\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved5\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved6\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved7\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved8\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved9\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved10\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved11\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved12\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"reserved13\":[\"This value is undefined and MUST be ignored.\", 2],\n",
+    "    \"lidFE\":[\"A LID whose meaning depends on the nFib value, which is one of the following.\", 2]\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../doc2python/data/FibRgW97.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(FibRgW97))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FibRgLw97"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FibRgLw97_list = [\"cbMac\",\n",
+    "    \"reserved1\",\n",
+    "    \"reserved2\",\n",
+    "    \"ccpText\",\n",
+    "    \"ccpFtn\",\n",
+    "    \"ccpHdd\",\n",
+    "    \"reserved3\",\n",
+    "    \"ccpAtn\",\n",
+    "    \"ccpEdn\",\n",
+    "    \"ccpTxbx\",\n",
+    "    \"ccpHdrTxbx\",\n",
+    "    \"reserved4\",\n",
+    "    \"reserved5\",\n",
+    "    \"reserved6\",\n",
+    "    \"reserved7\",\n",
+    "    \"reserved8\",\n",
+    "    \"reserved9\",\n",
+    "    \"reserved10\",\n",
+    "    \"reserved11\",\n",
+    "    \"reserved12\",\n",
+    "    \"reserved13\",\n",
+    "    \"reserved14\"\n",
+    "]\n",
+    "\n",
+    "FibRgLw97 = OrderedDict(zip(FibRgLw97_list, [[\"\",4] for y in range(len(FibRgLw97_list))]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../doc2python/data/FibRgLw97.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(FibRgLw97))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FibRgFcLcb97"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "744"
+      ]
+     },
+     "execution_count": 119,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "FibRgFcLcb97_list = [\n",
+    "    \"fcStshfOrig\",\n",
+    "    \"lcbStshfOrig\",\n",
+    "    \"fcStshf\",\n",
+    "    \"lcbStshf\",\n",
+    "    \"fcPlcffndRef\",\n",
+    "    \"lcbPlcffndRef\",\n",
+    "    \"fcPlcffndTxt\",\n",
+    "    \"lcbPlcffndTxt\",\n",
+    "    \"fcPlcfandRef\",\n",
+    "    \"lcbPlcfandRef\",\n",
+    "    \"fcPlcfandTxt\",\n",
+    "    \"lcbPlcfandTxt\",\n",
+    "    \"fcPlcfSed\",\n",
+    "    \"lcbPlcfSed\",\n",
+    "    \"fcPlcPad\",\n",
+    "    \"lcbPlcPad\",\n",
+    "    \"fcPlcfPhe\",\n",
+    "    \"lcbPlcfPhe\",\n",
+    "    \"fcSttbfGlsy\",\n",
+    "    \"lcbSttbfGlsy\",\n",
+    "    \"fcPlcfGlsy\",\n",
+    "    \"lcbPlcfGlsy\",\n",
+    "    \"fcPlcfHdd\",\n",
+    "    \"lcbPlcfHdd\",\n",
+    "    \"fcPlcfBteChpx\",\n",
+    "    \"lcbPlcfBteChpx\",\n",
+    "    \"fcPlcfBtePapx\",\n",
+    "    \"lcbPlcfBtePapx\",\n",
+    "    \"fcPlcfSea\",\n",
+    "    \"lcbPlcfSea\",\n",
+    "    \"fcSttbfFfn\",\n",
+    "    \"lcbSttbfFfn\",\n",
+    "    \"fcPlcfFldMom\",\n",
+    "    \"lcbPlcfFldMom\",\n",
+    "    \"fcPlcfFldHdr\",\n",
+    "    \"lcbPlcfFldHdr\",\n",
+    "    \"fcPlcfFldFtn\",\n",
+    "    \"lcbPlcfFldFtn\",\n",
+    "    \"fcPlcfFldAtn\",\n",
+    "    \"lcbPlcfFldAtn\",\n",
+    "    \"fcPlcfFldMcr\",\n",
+    "    \"lcbPlcfFldMcr\",\n",
+    "    \"fcSttbfBkmk\",\n",
+    "    \"lcbSttbfBkmk\",\n",
+    "    \"fcPlcfBkf\",\n",
+    "    \"lcbPlcfBkf\",\n",
+    "    \"fcPlcfBkl\",\n",
+    "    \"lcbPlcfBkl\",\n",
+    "    \"fcCmds\",\n",
+    "    \"lcbCmds\",\n",
+    "    \"fcUnused1_old_old\",\n",
+    "    \"lcbUnused1_old_old\",\n",
+    "    \"fcSttbfMcr\",\n",
+    "    \"lcbSttbfMcr\",\n",
+    "    \"fcPrDrvr\",\n",
+    "    \"lcbPrDrvr\",\n",
+    "    \"fcPrEnvPort\",\n",
+    "    \"lcbPrEnvPort\",\n",
+    "    \"fcPrEnvLand\",\n",
+    "    \"lcbPrEnvLand\",\n",
+    "    \"fcWss\",\n",
+    "    \"lcbWss\",\n",
+    "    \"fcDop\",\n",
+    "    \"lcbDop\",\n",
+    "    \"fcSttbfAssoc\",\n",
+    "    \"lcbSttbfAssoc\",\n",
+    "    \"fcClx\",\n",
+    "    \"lcbClx\",\n",
+    "    \"fcPlcfPgdFtn\",\n",
+    "    \"lcbPlcfPgdFtn\",\n",
+    "    \"fcAutosaveSource\",\n",
+    "    \"lcbAutosaveSource\",\n",
+    "    \"fcGrpXstAtnOwners\",\n",
+    "    \"lcbGrpXstAtnOwners\",\n",
+    "    \"fcSttbfAtnBkmk\",\n",
+    "    \"lcbSttbfAtnBkmk\",\n",
+    "    \"fcUnused2_old_old\",\n",
+    "    \"lcbUnused2_old_old\",\n",
+    "    \"fcUnused3_old\",\n",
+    "    \"lcbUnused3_old\",\n",
+    "    \"fcPlcSpaMom\",\n",
+    "    \"lcbPlcSpaMom\",\n",
+    "    \"fcPlcSpaHdr\",\n",
+    "    \"lcbPlcSpaHdr\",\n",
+    "    \"fcPlcfAtnBkf\",\n",
+    "    \"lcbPlcfAtnBkf\",\n",
+    "    \"fcPlcfAtnBkl\",\n",
+    "    \"lcbPlcfAtnBkl\",\n",
+    "    \"fcPms\",\n",
+    "    \"lcbPms\",\n",
+    "    \"fcFormFldSttbs\",\n",
+    "    \"lcbFormFldSttbs\",\n",
+    "    \"fcPlcfendRef\",\n",
+    "    \"lcbPlcfendRef\",\n",
+    "    \"fcPlcfendTxt\",\n",
+    "    \"lcbPlcfendTxt\",\n",
+    "    \"fcPlcfFldEdn\",\n",
+    "    \"lcbPlcfFldEdn\",\n",
+    "    \"fcUnused4_old\",\n",
+    "    \"lcbUnused4_old\",\n",
+    "    \"fcDggInfo\",\n",
+    "    \"lcbDggInfo\",\n",
+    "    \"fcSttbfRMark\",\n",
+    "    \"lcbSttbfRMark\",\n",
+    "    \"fcSttbfCaption\",\n",
+    "    \"lcbSttbfCaption\",\n",
+    "    \"fcSttbfAutoCaption\",\n",
+    "    \"lcbSttbfAutoCaption\",\n",
+    "    \"fcPlcfWkb\",\n",
+    "    \"lcbPlcfWkb\",\n",
+    "    \"fcPlcfSpl\",\n",
+    "    \"lcbPlcfSpl\",\n",
+    "    \"fcPlcftxbxTxt\",\n",
+    "    \"lcbPlcftxbxTxt\",\n",
+    "    \"fcPlcfFldTxbx\",\n",
+    "    \"lcbPlcfFldTxbx\",\n",
+    "    \"fcPlcfHdrtxbxTxt\",\n",
+    "    \"lcbPlcfHdrtxbxTxt\",\n",
+    "    \"fcPlcffldHdrTxbx\",\n",
+    "    \"lcbPlcffldHdrTxbx\",\n",
+    "    \"fcStwUser\",\n",
+    "    \"lcbStwUser\",\n",
+    "    \"fcSttbTtmbd\",\n",
+    "    \"lcbSttbTtmbd\",\n",
+    "    \"fcCookieData\",\n",
+    "    \"lcbCookieData\",\n",
+    "    \"fcPgdMotherOldOld\",\n",
+    "    \"lcbPgdMotherOldOld\",\n",
+    "    \"fcBkdMotherOldOld\",\n",
+    "    \"lcbBkdMotherOldOld\",\n",
+    "    \"fcPgdFtnOldOld\",\n",
+    "    \"lcbPgdFtnOldOld\",\n",
+    "    \"fcBkdFtnOldOld\",\n",
+    "    \"lcbBkdFtnOldOld\",\n",
+    "    \"fcPgdEdnOldOld\",\n",
+    "    \"lcbPgdEdnOldOld\",\n",
+    "    \"fcBkdEdnOldOld\",\n",
+    "    \"lcbBkdEdnOldOld\",\n",
+    "    \"fcSttbfIntlFld\",\n",
+    "    \"lcbSttbfIntlFld\",\n",
+    "    \"fcRouteSlip\",\n",
+    "    \"lcbRouteSlip\",\n",
+    "    \"fcSttbSavedBy\",\n",
+    "    \"lcbSttbSavedBy\",\n",
+    "    \"fcSttbFnm\",\n",
+    "    \"lcbSttbFnm\",\n",
+    "    \"fcPlfLst\",\n",
+    "    \"lcbPlfLst\",\n",
+    "    \"fcPlfLfo\",\n",
+    "    \"lcbPlfLfo\",\n",
+    "    \"fcPlcfTxbxBkd\",\n",
+    "    \"lcbPlcfTxbxBkd\",\n",
+    "    \"fcPlcfTxbxHdrBkd\",\n",
+    "    \"lcbPlcfTxbxHdrBkd\",\n",
+    "    \"fcDocUndoWord9\",\n",
+    "    \"lcbDocUndoWord9\",\n",
+    "    \"fcRgbUse\",\n",
+    "    \"lcbRgbUse\",\n",
+    "    \"fcUsp\",\n",
+    "    \"lcbUsp\",\n",
+    "    \"fcUskf\",\n",
+    "    \"lcbUskf\",\n",
+    "    \"fcPlcupcRgbUse\",\n",
+    "    \"lcbPlcupcRgbUse\",\n",
+    "    \"fcPlcupcUsp\",\n",
+    "    \"lcbPlcupcUsp\",\n",
+    "    \"fcSttbGlsyStyle\",\n",
+    "    \"lcbSttbGlsyStyle\",\n",
+    "    \"fcPlgosl\",\n",
+    "    \"lcbPlgosl\",\n",
+    "    \"fcPlcocx\",\n",
+    "    \"lcbPlcocx\",\n",
+    "    \"fcPlcfBteLvc\",\n",
+    "    \"lcbPlcfBteLvc\",\n",
+    "    \"dwLowDateTime\",\n",
+    "    \"dwHighDateTime\",\n",
+    "    \"fcPlcfLvcPre10\",\n",
+    "    \"lcbPlcfLvcPre10\",\n",
+    "    \"fcPlcfAsumy\",\n",
+    "    \"lcbPlcfAsumy\",\n",
+    "    \"fcPlcfGram\",\n",
+    "    \"lcbPlcfGram\",\n",
+    "    \"fcSttbListNames\",\n",
+    "    \"lcbSttbListNames\",\n",
+    "    \"fcSttbfUssr\",\n",
+    "    \"lcbSttbfUssr\"]\n",
+    "len(FibRgFcLcb97_list)*4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "120\n",
+      "864\n"
+     ]
+    }
+   ],
+   "source": [
+    "FibRgFcLcb00_list = [\n",
+    "    \"fcPlcfTch\",\n",
+    "    \"lcbPlcfTch\",\n",
+    "    \"fcRmdThreading\",\n",
+    "    \"lcbRmdThreading\",\n",
+    "    \"fcMid\",\n",
+    "    \"lcbMid\",\n",
+    "    \"fcSttbRgtplc\",\n",
+    "    \"lcbSttbRgtplc\",\n",
+    "    \"fcMsoEnvelope\",\n",
+    "    \"lcbMsoEnvelope\",\n",
+    "    \"fcPlcfLad\",\n",
+    "    \"lcbPlcfLad\",\n",
+    "    \"fcRgDofr\",\n",
+    "    \"lcbRgDofr\",\n",
+    "    \"fcPlcosl\",\n",
+    "    \"lcbPlcosl\",\n",
+    "    \"fcPlcfCookieOld\",\n",
+    "    \"lcbPlcfCookieOld\",\n",
+    "    \"fcPgdMotherOld\",\n",
+    "    \"lcbPgdMotherOld\",\n",
+    "    \"fcBkdMotherOld\",\n",
+    "    \"lcbBkdMotherOld\",\n",
+    "    \"fcPgdFtnOld\",\n",
+    "    \"lcbPgdFtnOld\",\n",
+    "    \"fcBkdFtnOld\",\n",
+    "    \"lcbBkdFtnOld\",\n",
+    "    \"fcPgdEdnOld\",\n",
+    "    \"lcbPgdEdnOld\",\n",
+    "    \"fcBkdEdnOld\",\n",
+    "    \"lcbBkdEdnOld\"\n",
+    "]\n",
+    "print(len(FibRgFcLcb00_list)*4)\n",
+    "print(len(FibRgFcLcb97_list)*4+len(FibRgFcLcb00_list)*4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "224\n",
+      "1088\n"
+     ]
+    }
+   ],
+   "source": [
+    "FibRgFcLcb02_list =[\n",
+    "  \"fcUnused1_old\",\n",
+    "    \"lcbUnused1_old\",\n",
+    "    \"fcPlcfPgp\",\n",
+    "    \"lcbPlcfPgp\",\n",
+    "    \"fcPlcfuim\",\n",
+    "    \"lcbPlcfuim\",\n",
+    "    \"fcPlfguidUim\",\n",
+    "    \"lcbPlfguidUim\",\n",
+    "    \"fcAtrdExtra\",\n",
+    "    \"lcbAtrdExtra\",\n",
+    "    \"fcPlrsid\",\n",
+    "    \"lcbPlrsid\",\n",
+    "    \"fcSttbfBkmkFactoid\",\n",
+    "    \"lcbSttbfBkmkFactoid\",\n",
+    "    \"fcPlcfBkfFactoid\",\n",
+    "    \"lcbPlcfBkfFactoid\",\n",
+    "    \"fcPlcfcookie\",\n",
+    "    \"lcbPlcfcookie\",\n",
+    "    \"fcPlcfBklFactoid\",\n",
+    "    \"lcbPlcfBklFactoid\",\n",
+    "    \"fcFactoidData\",\n",
+    "    \"lcbFactoidData\",\n",
+    "    \"fcDocUndo\",\n",
+    "    \"lcbDocUndo\",\n",
+    "    \"fcSttbfBkmkFcc\",\n",
+    "    \"lcbSttbfBkmkFcc\",\n",
+    "    \"fcPlcfBkfFcc\",\n",
+    "    \"lcbPlcfBkfFcc\",\n",
+    "    \"fcPlcfBklFcc\",\n",
+    "    \"lcbPlcfBklFcc\",\n",
+    "    \"fcSttbfbkmkBPRepairs\",\n",
+    "    \"lcbSttbfbkmkBPRepairs\",\n",
+    "    \"fcPlcfbkfBPRepairs\",\n",
+    "    \"lcbPlcfbkfBPRepairs\",\n",
+    "    \"fcPlcfbklBPRepairs\",\n",
+    "    \"lcbPlcfbklBPRepairs\",\n",
+    "    \"fcPmsNew\",\n",
+    "    \"lcbPmsNew\",\n",
+    "    \"fcODSO\",\n",
+    "    \"lcbODSO\",\n",
+    "    \"fcPlcfpmiOldXP\",\n",
+    "    \"lcbPlcfpmiOldXP\",\n",
+    "    \"fcPlcfpmiNewXP\",\n",
+    "    \"lcbPlcfpmiNewXP\",\n",
+    "    \"fcPlcfpmiMixedXP\",\n",
+    "    \"lcbPlcfpmiMixedXP\",\n",
+    "    \"fcUnused2_old\",\n",
+    "    \"lcbUnused2_old\",\n",
+    "    \"fcPlcffactoid\",\n",
+    "    \"lcbPlcffactoid\",\n",
+    "    \"fcPlcflvcOldXP\",\n",
+    "    \"lcbPlcflvcOldXP\",\n",
+    "    \"fcPlcflvcNewXP\",\n",
+    "    \"lcbPlcflvcNewXP\",\n",
+    "    \"fcPlcflvcMixedXP\",\n",
+    "    \"lcbPlcflvcMixedXP\"\n",
+    "] \n",
+    "\n",
+    "print(len(FibRgFcLcb02_list)*4)\n",
+    "print(len(FibRgFcLcb97_list)*4+len(FibRgFcLcb00_list)*4+len(FibRgFcLcb02_list)*4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "224"
+      ]
+     },
+     "execution_count": 122,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "FibRgFcLcb03_list = [\"fcHplxsdr\",\n",
+    "    \"lcbHplxsdr\",\n",
+    "    \"fcSttbfBkmkSdt\",\n",
+    "    \"lcbSttbfBkmkSdt\",\n",
+    "    \"fcPlcfBkfSdt\",\n",
+    "    \"lcbPlcfBkfSdt\",\n",
+    "    \"fcPlcfBklSdt\",\n",
+    "    \"lcbPlcfBklSdt\",\n",
+    "    \"fcCustomXForm\",\n",
+    "    \"lcbCustomXForm\",\n",
+    "    \"fcSttbfBkmkProt\",\n",
+    "    \"lcbSttbfBkmkProt\",\n",
+    "    \"fcPlcfBkfProt\",\n",
+    "    \"lcbPlcfBkfProt\",\n",
+    "    \"fcPlcfBklProt\",\n",
+    "    \"lcbPlcfBklProt\",\n",
+    "    \"fcSttbProtUser\",\n",
+    "    \"lcbSttbProtUser\",\n",
+    "    \"fcUnused\",\n",
+    "    \"lcbUnused\",\n",
+    "    \"fcPlcfpmiOld\",\n",
+    "    \"lcbPlcfpmiOld\",\n",
+    "    \"fcPlcfpmiOldInline\",\n",
+    "    \"lcbPlcfpmiOldInline\",\n",
+    "    \"fcPlcfpmiNew\",\n",
+    "    \"lcbPlcfpmiNew\",\n",
+    "    \"fcPlcfpmiNewInline\",\n",
+    "    \"lcbPlcfpmiNewInline\",\n",
+    "    \"fcPlcflvcOld\",\n",
+    "    \"lcbPlcflvcOld\",\n",
+    "    \"fcPlcflvcOldInline\",\n",
+    "    \"lcbPlcflvcOldInline\",\n",
+    "    \"fcPlcflvcNew\",\n",
+    "    \"lcbPlcflvcNew\",\n",
+    "    \"fcPlcflvcNewInline\",\n",
+    "    \"lcbPlcflvcNewInline\",\n",
+    "    \"fcPgdMother\",\n",
+    "    \"lcbPgdMother\",\n",
+    "    \"fcBkdMother\",\n",
+    "    \"lcbBkdMother\",\n",
+    "    \"fcAfdMother\",\n",
+    "    \"lcbAfdMother\",\n",
+    "    \"fcPgdFtn\",\n",
+    "    \"lcbPgdFtn\",\n",
+    "    \"fcBkdFtn\",\n",
+    "    \"lcbBkdFtn\",\n",
+    "    \"fcAfdFtn\",\n",
+    "    \"lcbAfdFtn\",\n",
+    "    \"fcPgdEdn\",\n",
+    "    \"lcbPgdEdn\",\n",
+    "    \"fcBkdEdn\",\n",
+    "    \"lcbBkdEdn\",\n",
+    "    \"fcAfdEdn\",\n",
+    "    \"lcbAfdEdn\",\n",
+    "    \"fcAfd\",\n",
+    "    \"lcbAfd\"\n",
+    "]\n",
+    "len(FibRgFcLcb03_list)*4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "152\n"
+     ]
+    }
+   ],
+   "source": [
+    "FibRgFcLcb07_list = [\n",
+    "    \"fcPlcfmthd\",\n",
+    "    \"lcbPlcfmthd\",\n",
+    "    \"fcSttbfBkmkMoveFrom\",\n",
+    "    \"lcbSttbfBkmkMoveFrom\",\n",
+    "    \"fcPlcfBkfMoveFrom\",\n",
+    "    \"lcbPlcfBkfMoveFrom\",\n",
+    "    \"fcPlcfBklMoveFrom\",\n",
+    "    \"lcbPlcfBklMoveFrom\",\n",
+    "    \"fcSttbfBkmkMoveTo\",\n",
+    "    \"lcbSttbfBkmkMoveTo\",\n",
+    "    \"fcPlcfBkfMoveTo\",\n",
+    "    \"lcbPlcfBkfMoveTo\",\n",
+    "    \"fcPlcfBklMoveTo\",\n",
+    "    \"lcbPlcfBklMoveTo\",\n",
+    "    \"fcUnused1\",\n",
+    "    \"lcbUnused1\",\n",
+    "    \"fcUnused2\",\n",
+    "    \"lcbUnused2\",\n",
+    "    \"fcUnused3\",\n",
+    "    \"lcbUnused3\",\n",
+    "    \"fcSttbfBkmkArto\",\n",
+    "    \"lcbSttbfBkmkArto\",\n",
+    "    \"fcPlcfBkfArto\",\n",
+    "    \"lcbPlcfBkfArto\",\n",
+    "    \"fcPlcfBklArto\",\n",
+    "    \"lcbPlcfBklArto\",\n",
+    "    \"fcArtoData\",\n",
+    "    \"lcbArtoData\",\n",
+    "    \"fcUnused4\",\n",
+    "    \"lcbUnused4\",\n",
+    "    \"fcUnused5\",\n",
+    "    \"lcbUnused5\",\n",
+    "    \"fcUnused6\",\n",
+    "    \"lcbUnused6\",\n",
+    "    \"fcOssTheme\",\n",
+    "    \"lcbOssTheme\",\n",
+    "    \"fcColorSchemeMapping\",\n",
+    "    \"lcbColorSchemeMapping\"\n",
+    "]\n",
+    "print(len(FibRgFcLcb07_list)*4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 129,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FibRgFcLcb_list = FibRgFcLcb97_list+FibRgFcLcb00_list+FibRgFcLcb02_list+FibRgFcLcb03_list+FibRgFcLcb07_list\n",
+    "\n",
+    "FibRgFcLcb = OrderedDict(zip(FibRgFcLcb_list, [[\"\",4] for y in range(len(FibRgFcLcb_list))]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 130,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../doc2python/data/FibRgFcLcb.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(FibRgFcLcb))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FibRgCswNew\n",
+    "FibRgCswNew = OrderedDict({\n",
+    "    \"nFibNew\":[\"An unsigned integer that specifies the version number of the file format that is used. This value MUST be one of the following : 0x00D9, 0x0101, 0x010C, 0x0112\",2],\n",
+    "    \"rgCswNewData\":[\"Depending on the value of nFibNew this is one of the following : 0x00D9, 0x0101, 0x010C\", 2],\n",
+    "    \"rgCswNewData_extend\":[\"Extension of rgCswNewData if nFibNew is 0x0112, you have to reconstitute rgCswNewData in that case\", 6]\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../doc2python/data/FibRgCswNew.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(FibRgCswNew))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pcd = OrderedDict({\n",
+    "    \"flag1\":[\"\",2, \"bits\", [\"fNoParaLast\",\"fR1\",\"fDirty\", \"fR2\"], [1,1,1,13]],\n",
+    "    \"fc\":[\"An FcCompressed structure that specifies the location of the text in the WordDocument Stream.\", 4],\n",
+    "    \"prm\":[\"A Prm structure that specifies additional properties for this text. These properties are used as part of the algorithms in sections 2.4.6.1 (Direct Paragraph Formatting) and 2.4.6.2 (Direct Character Formatting).\", 2]\n",
+    "})\n",
+    "\n",
+    "with open(\"../doc2python/data/pcd.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(pcd))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fc = OrderedDict({\n",
+    "    \"flag1\":[\"\",4, \"bits\", [\"fc\",\"fCompressed\",\"r1\"], [30,1,1]]})\n",
+    "\n",
+    "with open(\"../doc2python/data/fc.json\", \"w\") as file:\n",
+    "    file.write(json.dumps(fc))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 325 - 0
doc2python/__init__.py

@@ -0,0 +1,325 @@
+#
+# doc2python script
+# Extract text from doc file
+#
+
+# Dependencies
+import os
+import json
+import compoundfiles
+import re
+from collections import OrderedDict
+
+# Absolute path of the directory containing this module; used to locate the
+# JSON offset tables shipped under data/ regardless of the working directory.
+package_directory = os.path.dirname(os.path.abspath(__file__))
+
+# Functions
+
+def bits_from_bytes(bytes_data, collapse = True):
+    '''
+        bits_from_bytes : get the bits of one or several bytes.
+
+        Each byte is rendered as an 8-character string of '0'/'1' with the
+        least significant bit first, so index 0 of a byte's string is its bit 0.
+
+        Param :
+            bytes_data : bytes-like object (or iterable of ints), or a single int
+            collapse : if True, the per-byte strings are joined into one string;
+                       if False, the output is a list with the bits of each byte
+        Output :
+            str when collapse is True, list of str otherwise
+    '''
+
+    # A single integer is handled as a one-byte sequence
+    if isinstance(bytes_data, int):
+        bytes_data = [bytes_data]
+
+    # Zero-pad to 8 binary digits, then reverse to get the LSB-first order
+    bits_data = [format(byte, '08b')[::-1] for byte in bytes_data]
+
+    if collapse:
+        return "".join(bits_data)
+
+    return bits_data
+
+class bytesParser():
+    '''
+        bytesParser
+            Parse byte strings extracted from a .doc compound file.
+            Fixed-size fields are decoded as little-endian integers (bytes,
+            decimal and hexadecimal values are kept); bit fields are split
+            according to the JSON offset tables stored in the data folder.
+    '''
+    def __init__(self):
+        # Getting parser offsets from json files
+        self._offset_names = ["FibBase","clw", "FibRgW97","cslw","FibRgFcLcb","cbRgFcLcb","FibRgLw97","cswNew", "FibRgCswNew", "pcd", "fc"]
+
+        self._offsets_dict = {}
+
+        for offset in self._offset_names:
+            # OrderedDict keeps the fields in file order : parse() consumes the bytes in that order
+            with open(os.path.join(package_directory, "data", offset+".json"), "r") as file:
+                self._offsets_dict[offset] = json.load(file, object_pairs_hook=OrderedDict)
+
+    def _process_byte (self, byte):
+        '''
+            Get decimal and hexadecimal from a byte string.
+            Input :
+                byte : bytes value, read as a little-endian unsigned integer
+            Output :
+                dict containing bytes, decimal and hexadecimal value
+        '''
+
+        decimal = int.from_bytes(byte, 'little')
+
+        return {
+            "bytes": byte,
+            "decimal": decimal,
+            "hexadecimal": hex(decimal)
+        }
+
+    def parse(self, data, data_type):
+        '''
+            Parse a byte string according to the offset table of data_type
+            input :
+                - data, bytes : binary string to parse
+                - data_type, str : name of the binary string (ex : Fib...)
+            output :
+                - dict of parsed data : one entry per field, or one entry per
+                  named bit group for fields declared as "bits"
+            raises :
+                - Exception if data_type or a field kind is unknown
+        '''
+
+        if data_type not in self._offset_names:
+            raise Exception("Unknown data type")
+
+        offset_table = self._offsets_dict[data_type]
+        data_values = {} # Contains extracted value
+
+        cursor = 0
+        for name, spec in offset_table.items():
+            delta = int(spec[1]) # Size of the field, in bytes
+            chunk = data[cursor:cursor+delta]
+
+            if len(spec) == 2:
+                # Plain field : [description, size]
+                data_values[name] = self._process_byte(chunk)
+                data_values[name]["len"] = delta
+            elif spec[2] == 'bits':
+                # Bit field : [description, size, "bits", names, sizes in bits]
+                bits_data = bits_from_bytes(chunk)
+
+                cursor_bits = 0
+                for var_name, var_size in zip(spec[3], spec[4]):
+                    # bits_data is LSB first : reverse the slice to read it as a binary number
+                    bit = bits_data[cursor_bits:cursor_bits+var_size][::-1]
+                    data_values[var_name] = {
+                        "bit": bit,
+                        "numeric": int(bit, 2),
+                        "length": var_size
+                    }
+
+                    cursor_bits += var_size
+            else:
+                raise Exception("Unknown data type")
+
+            cursor += delta
+
+        return data_values
+
+    def parseFib(self, wd_data):
+        '''
+            Parse Fib : header of Word Document
+            input :
+                - wd_data, bytes : WordDocument data
+            output :
+                - dict of parsed data, one entry per Fib section
+        '''
+
+        fib_data = {} # Dict for output data
+
+        cursor = 0
+        # If 0 : need special rule (size given by the preceding count field)
+        steps = OrderedDict({
+            "FibBase":32,
+            "clw":2,
+            "FibRgW97":28,
+            "cslw":2,
+            "FibRgLw97":88,
+            "cbRgFcLcb":2,
+            "FibRgFcLcb":0,
+            "cswNew":2,
+            "FibRgCswNew":0
+        })
+
+        for step, size in steps.items():
+            if step == 'FibRgFcLcb':
+                # cbRgFcLcb counts 8-byte elements
+                delta = fib_data["cbRgFcLcb"]["cbRgFcLcb"]["decimal"]*8
+            elif step == 'FibRgCswNew':
+                # cswNew counts 2-byte elements
+                delta = fib_data["cswNew"]["cswNew"]["decimal"]*2
+            else:
+                delta = size
+
+            fib_data[step] = self.parse(wd_data[cursor:cursor+delta], step)
+
+            cursor += delta # Move cursor
+
+        # Post treatment
+
+        ## Reconstitute FibRgCswNew : when nFibNew is 0x0112, rgCswNewData is 8 bytes long,
+        ## i.e. the 2 bytes already read as rgCswNewData followed by the 6 bytes of rgCswNewData_extend
+        csw_new = fib_data["FibRgCswNew"]
+        if csw_new["nFibNew"]["decimal"] == 0x0112:
+            csw_new["rgCswNewData"] = self._process_byte(csw_new["rgCswNewData"]["bytes"]+csw_new["rgCswNewData_extend"]["bytes"])
+            csw_new["rgCswNewData"]["len"] = 8
+
+        return fib_data
+
+    def parsePlcPcd (self, table_PlcPcd_binary):
+        '''
+            Parse PlcPcd : PlcPcd from Table data
+            input :
+                - table_PlcPcd_binary, bytes : PlcPcd data
+            output :
+                - dict with "cp" (list of parsed CPs) and "apcd" (list of parsed Pcd,
+                  the "fc" entry of each Pcd being itself parsed)
+        '''
+
+        # Calculating number of CPs : n+1 CPs of 4 bytes and n Pcds of 8 bytes, so len = 12n+4
+        nb_cp = (len(table_PlcPcd_binary)+8)//12
+        nb_apcd = nb_cp-1
+
+        plcPcd_data = {
+            "cp":[],
+            "apcd":[]
+        }
+
+        cursor = 0
+
+        # Parsing CPs
+        for _ in range(nb_cp):
+            plcPcd_data["cp"].append(
+                self._process_byte(table_PlcPcd_binary[cursor:cursor+4])
+            )
+
+            cursor += 4
+
+        # Parsing aPCDs
+        for _ in range(nb_apcd):
+            pcd = self.parse(table_PlcPcd_binary[cursor:cursor+8], "pcd")
+
+            # Parsing fc
+            pcd["fc"] = self.parse(pcd["fc"]["bytes"], "fc")
+            plcPcd_data["apcd"].append(pcd)
+
+            cursor += 8
+
+        return plcPcd_data
+
+    def parsePcdt (self, table_pcdt_binary):
+        '''
+            Parse Pcdt : clxt marker, lcb field and PlcPcd
+            input :
+                - table_pcdt_binary, bytes : Pcdt data
+            output :
+                - dict with "clxt", "lcb" and "PlcPcd" entries, each one carrying a "len" key
+        '''
+
+        pcdt_data = {}
+
+        # Fixed-size header fields; PlcPcd (0) takes whatever remains after them
+        steps = {
+            "clxt":1,
+            "lcb":4,
+            "PlcPcd":0
+        }
+        cursor = 0
+
+        for step in steps:
+            if step == "PlcPcd":
+                delta = len(table_pcdt_binary)-cursor
+                pcdt_data[step] = self.parsePlcPcd(table_pcdt_binary[cursor:cursor+delta])
+            else:
+                delta = steps[step]
+                pcdt_data[step] = self._process_byte(table_pcdt_binary[cursor:cursor+delta])
+
+            pcdt_data[step]["len"] = delta
+            cursor = cursor+delta
+
+        return pcdt_data
+
+    def parseClx (self, table_clx_binary):
+        '''
+            Parse Clx : split it into RgPrc and Pcdt, then parse Pcdt
+            input :
+                - table_clx_binary, bytes : Clx data
+            output :
+                - dict with "RgPrc" (raw bytes) and "Pcdt" (parsed data)
+            raises :
+                - KeyError if no Pcdt can be located
+        '''
+
+        clx_data = {}
+        clx_len = len(table_clx_binary)
+
+        # RgPrc is an array of Prc, each one being : clxt (1 byte, 0x01), cbGrpprl (2 bytes)
+        # then cbGrpprl bytes of GrpPrl. Walking them avoids mistaking a 0x02 byte located
+        # inside a Prc for the start of Pcdt (which begins with clxt = 0x02).
+        cursor = 0
+        while cursor < clx_len and table_clx_binary[cursor] == 1:
+            cb_grpprl = int.from_bytes(table_clx_binary[cursor+1:cursor+3], 'little')
+            cursor += 3+cb_grpprl
+
+        if cursor >= clx_len or table_clx_binary[cursor] != 2:
+            # Unexpected layout : fall back on the first 0x02 byte (best effort)
+            cursor = next((i for i in range(clx_len) if table_clx_binary[i] == 2), None)
+            if cursor is None:
+                raise KeyError("Pcdt")
+
+        clx_data["RgPrc"] = table_clx_binary[0:cursor]
+
+        # Getting Pcdt data
+        clx_data["Pcdt"] = self.parsePcdt(table_clx_binary[cursor:])
+
+        return clx_data
+
+def process(file, encoding = "latin1"):
+    '''
+        Extract text from doc file.
+        Input :
+            file, str or bytesIO : file path or bytesIO object for doc file
+            encoding, str : codec used to decode the compressed (8-bit) text pieces;
+                            uncompressed pieces are always decoded as UTF-16LE
+        Output :
+            String of doc content
+    '''
+
+    # Loading parser
+    bp = bytesParser()
+
+    # Loading compound file data; the reader is closed once both streams are in memory
+    with compoundfiles.CompoundFileReader(file) as cf_data:
+        # Getting data from cf
+        WordDocument = cf_data.open("WordDocument").read()
+        # Fib lies sometimes on Table, so directly get it from cf_data
+        table_name = [x.name for x in cf_data.root if x.name[1:]=='Table'][0]
+        Table = cf_data.open(table_name).read()
+
+    # Parsing data
+    FibData = bp.parseFib(WordDocument)
+
+    # Getting Clx
+    clx_start = FibData["FibRgFcLcb"]["fcClx"]["decimal"]
+    clx_end = clx_start+FibData["FibRgFcLcb"]["lcbClx"]["decimal"]
+    Clx_parsed = bp.parseClx(Table[clx_start:clx_end])
+
+    # Getting Cps and PCDs
+    cp = Clx_parsed["Pcdt"]["PlcPcd"]["cp"]
+    apcd = Clx_parsed["Pcdt"]["PlcPcd"]["apcd"]
+
+    # Getting text from doc
+    text_array = []
+    last_piece = len(apcd)-1
+    for i, pcd in enumerate(apcd):
+        compressed = pcd["fc"]["fCompressed"]["numeric"]
+
+        # Piece i holds the characters cp[i] .. cp[i+1]-1
+        nb_chars = cp[i+1]["decimal"]-cp[i]["decimal"]
+        if i == last_piece:
+            # The trailing character of the document is dropped, as in previous versions
+            nb_chars -= 1
+        nb_chars = max(nb_chars, 0) # Avoid a negative length, which would produce a wrap-around slice
+
+        # Compressed pieces : 1 byte per character, stored at fc/2; otherwise 2 bytes per character, stored at fc
+        start = pcd["fc"]["fc"]["numeric"]//(1+compressed)
+        end = start+(2-compressed)*nb_chars
+
+        piece_encoding = encoding if compressed else "utf-16-le"
+        text_array.append(WordDocument[start:end].decode(piece_encoding, errors = "ignore"))
+
+    fulltext = "".join(text_array)
+
+    # Postprocessing : field marks (0x13 begin, 0x14 separator, 0x15 end), line ends and cell marks (0x07)
+    fulltext = re.sub(r"\x13","",fulltext)
+    fulltext = re.sub("\r","\r\n",fulltext)
+    fulltext = re.sub(r'HYP?ERLINK "(.*?)"(?: *\\t *".*?")?(?: *\\o *".*?")?(?: *\\n *".*?")?(?: *\\m *".*?")?(?: *\\l *".*?")? *\x14(.*?)\x15',r'(\2) [\1]',fulltext)
+    fulltext = re.sub(r"\x00|\x01|\x14|\x15","",fulltext)
+    fulltext = re.sub(r'HYP?ERLINK *"(.*?)"',r'[\1]',fulltext)
+    fulltext = re.sub(r'INCLUDEPICTURE *"(.*?)"',r"IMG[\1]",fulltext)
+    fulltext = re.sub(r'\\\* *MERGEFORMA(TINET?)','',fulltext)
+    fulltext = re.sub(r"\x07\x07","\r\n", fulltext)
+    fulltext = re.sub(r'\x07','|',fulltext)
+
+    return fulltext

File diff suppressed because it is too large
+ 0 - 0
doc2python/data/FibBase.json


+ 1 - 0
doc2python/data/FibRgCswNew.json

@@ -0,0 +1 @@
+{"nFibNew": ["An unsigned integer that specifies the version number of the file format that is used. This value MUST be one of the following : 0x00D9, 0x0101, 0x010C, 0x0112", 2], "rgCswNewData": ["Depending on the value of nFibNew this is one of the following : 0x00D9, 0x0101, 0x010C", 2], "rgCswNewData_extend": ["Extension of rgCswNewData if nFibNew is 0x0112, you have to reconstitute rgCswNewData in that case", 6]}

File diff suppressed because it is too large
+ 0 - 0
doc2python/data/FibRgFcLcb.json


+ 1 - 0
doc2python/data/FibRgLw97.json

@@ -0,0 +1 @@
+{"cbMac": ["", 4], "reserved1": ["", 4], "reserved2": ["", 4], "ccpText": ["", 4], "ccpFtn": ["", 4], "ccpHdd": ["", 4], "reserved3": ["", 4], "ccpAtn": ["", 4], "ccpEdn": ["", 4], "ccpTxbx": ["", 4], "ccpHdrTxbx": ["", 4], "reserved4": ["", 4], "reserved5": ["", 4], "reserved6": ["", 4], "reserved7": ["", 4], "reserved8": ["", 4], "reserved9": ["", 4], "reserved10": ["", 4], "reserved11": ["", 4], "reserved12": ["", 4], "reserved13": ["", 4], "reserved14": ["", 4]}

+ 1 - 0
doc2python/data/FibRgW97.json

@@ -0,0 +1 @@
+{"reserved1": ["This value is undefined and MUST be ignored.", 2], "reserved2": ["This value is undefined and MUST be ignored.", 2], "reserved3": ["This value is undefined and MUST be ignored.", 2], "reserved4": ["This value is undefined and MUST be ignored.", 2], "reserved5": ["This value is undefined and MUST be ignored.", 2], "reserved6": ["This value is undefined and MUST be ignored.", 2], "reserved7": ["This value is undefined and MUST be ignored.", 2], "reserved8": ["This value is undefined and MUST be ignored.", 2], "reserved9": ["This value is undefined and MUST be ignored.", 2], "reserved10": ["This value is undefined and MUST be ignored.", 2], "reserved11": ["This value is undefined and MUST be ignored.", 2], "reserved12": ["This value is undefined and MUST be ignored.", 2], "reserved13": ["This value is undefined and MUST be ignored.", 2], "lidFE": ["A LID whose meaning depends on the nFib value, which is one of the following.", 2]}

+ 1 - 0
doc2python/data/cbRgFcLcb.json

@@ -0,0 +1 @@
+{"cbRgFcLcb": ["", 2]}

+ 1 - 0
doc2python/data/clw.json

@@ -0,0 +1 @@
+{"clw": ["", 2]}

+ 1 - 0
doc2python/data/cslw.json

@@ -0,0 +1 @@
+{"cslw": ["", 2]}

+ 1 - 0
doc2python/data/cswNew.json

@@ -0,0 +1 @@
+{"cswNew": ["", 2]}

+ 1 - 0
doc2python/data/fc.json

@@ -0,0 +1 @@
+{"flag1": ["", 4, "bits", ["fc", "fCompressed", "r1"], [30, 1, 1]]}

+ 1 - 0
doc2python/data/pcd.json

@@ -0,0 +1 @@
+{"flag1": ["", 2, "bits", ["fNoParaLast", "fR1", "fDirty", "fR2"], [1, 1, 1, 13]], "fc": ["An FcCompressed structure that specifies the location of the text in the WordDocument Stream.", 4], "prm": ["A Prm structure that specifies additional properties for this text. These properties are used as part of the algorithms in sections 2.4.6.1 (Direct Paragraph Formatting) and 2.4.6.2 (Direct Character Formatting).", 2]}

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+compoundfiles

+ 16 - 0
setup.py

@@ -0,0 +1,16 @@
+from setuptools import setup, find_packages
+
+
+# Read the long description with an explicit encoding and close the file handle
+with open('README.md', encoding='utf-8') as readme:
+    long_description = readme.read()
+
+setup(name='doc2python', 
+      version='0.0.1',
+      license='',
+      author='Ali BELLAMINE',
+      author_email='contact@alibellamine.me',
+      description='Extract text from doc file.',
+      long_description=long_description,
+      include_package_data=True,
+      packages = find_packages(include=["doc2python"]),
+      package_data = {
+          "":["data/*"]
+      },
+      # doc2python imports compoundfiles when the package is loaded (see requirements.txt)
+      install_requires=["compoundfiles"]
+)

Some files were not shown because too many files changed in this diff