Skip to content

PHPWord::Reader: Missing text runs... #2496

Open
@websuasive

Description

@websuasive

I'm trying to read in a Word document that has numerous text runs in it. However, all of the text within them is being lumped together into just a couple of text runs for the entire document.

I'm not sure if this is relevant but the text runs contain mc:AlternateContent sections that contain nested content themselves - such as picts, drawings and, most importantly, textboxes.

This means that all the text is extracted by my app. But 'text' items that should be distinct parts of the document are treated as one.

This is part of the document; I've collapsed the first few text runs. All of them are treated as one text run when the file is parsed using the PHPWord Reader.

<w:document mc:Ignorable="w14 w15 wp14">
	<w:body>
		<w:p w:rsidR="00FA35B2" w:rsidRDefault="00040E93">
			<w:pPr></w:pPr>
			<w:bookmarkStart w:id="0" w:name="_GoBack"/>
			<w:bookmarkEnd w:id="0"/>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r></w:r>
			<w:r>
				<w:rPr>
					<w:noProof/>
					<w:lang w:val="en-GB" w:eastAsia="en-GB"/>
				</w:rPr>
				<mc:AlternateContent>
					<mc:Choice Requires="wps">
						<w:drawing>
							<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="251561472" behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1">
								<wp:simplePos x="0" y="0"/>
								<wp:positionH relativeFrom="page">
									<wp:posOffset>2310130</wp:posOffset>
								</wp:positionH>
								<wp:positionV relativeFrom="page">
									<wp:posOffset>6385560</wp:posOffset>
								</wp:positionV>
								<wp:extent cx="4053840" cy="533400"/>
								<wp:effectExtent l="0" t="0" r="0" b="0"/>
								<wp:wrapSquare wrapText="bothSides"/>
								<wp:docPr id="203" name="Text Box 200"/>
								<wp:cNvGraphicFramePr>
									<a:graphicFrameLocks/>
								</wp:cNvGraphicFramePr>
								<a:graphic>
									<a:graphicData uri="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
										<wps:wsp>
											<wps:cNvSpPr txBox="1">
												<a:spLocks noChangeArrowheads="1"/>
											</wps:cNvSpPr>
											<wps:spPr bwMode="auto">
												<a:xfrm>
													<a:off x="0" y="0"/>
													<a:ext cx="4053840" cy="533400"/>
												</a:xfrm>
												<a:prstGeom prst="rect">
													<a:avLst/>
												</a:prstGeom>
												<a:noFill/>
												<a:ln>
													<a:noFill/>
												</a:ln>
												<a:extLst>
													<a:ext uri="{909E8E84-426E-40DD-AFC4-6F175D3DCCD1}">
														<a14:hiddenFill>
															<a:solidFill>
																<a:srgbClr val="FFFFFF"/>
															</a:solidFill>
														</a14:hiddenFill>
													</a:ext>
													<a:ext uri="{91240B29-F687-4F45-9708-019B960494DF}">
														<a14:hiddenLine w="9525">
															<a:solidFill>
																<a:srgbClr val="000000"/>
															</a:solidFill>
															<a:miter lim="800000"/>
															<a:headEnd/>
															<a:tailEnd/>
														</a14:hiddenLine>
													</a:ext>
												</a:extLst>
											</wps:spPr>
											<wps:txbx>
												<w:txbxContent>
													<w:p w:rsidR="00FA35B2" w:rsidRDefault="00040E93">
														<w:pPr>
															<w:spacing w:before="44" w:after="24" w:line="254" w:lineRule="exact"/>
															<w:textAlignment w:val="baseline"/>
															<w:rPr>
																<w:rFonts w:ascii="Arial" w:eastAsia="Arial" w:hAnsi="Arial"/>
																<w:color w:val="000000"/>
																<w:spacing w:val="-1"/>
															</w:rPr>
														</w:pPr>
														<w:r>
															<w:rPr>
																<w:rFonts w:ascii="Arial" w:eastAsia="Arial" w:hAnsi="Arial"/>
																<w:color w:val="000000"/>
																<w:spacing w:val="-1"/>
															</w:rPr>
															<w:t xml:space="preserve">
This role includes a moderate amount of activity and the physical movement of stock, including
</w:t>
														</w:r>
														<w:r>
															<w:rPr>
																<w:rFonts w:ascii="Arial" w:eastAsia="Arial" w:hAnsi="Arial"/>
																<w:color w:val="000000"/>
																<w:spacing w:val="-1"/>
															</w:rPr>
															<w:t>
lifting and moving objects may be required, all necessary health and safety training will be provided.
</w:t>
														</w:r>
													</w:p>
												</w:txbxContent>
											</wps:txbx>
											<wps:bodyPr rot="0" vert="horz" wrap="square" lIns="0" tIns="0" rIns="0" bIns="0" anchor="t" anchorCtr="0" upright="1">
												<a:noAutofit/>
											</wps:bodyPr>
										</wps:wsp>
									</a:graphicData>
								</a:graphic>
								<wp14:sizeRelH relativeFrom="page">
									<wp14:pctWidth>0</wp14:pctWidth>
								</wp14:sizeRelH>
								<wp14:sizeRelV relativeFrom="page">
									<wp14:pctHeight>0</wp14:pctHeight>
								</wp14:sizeRelV>
							</wp:anchor>
						</w:drawing>
					</mc:Choice>
					<mc:Fallback>
						<w:pict>
							<v:shape id="Text Box 200" o:spid="_x0000_s1036" type="#_x0000_t202" style="position:absolute;margin-left:181.9pt;margin-top:502.8pt;width:319.2pt;height:42pt;z-index:-251755008;visibility:visible;mso-wrap-style:square;mso-width-percent:0;mso-height-percent:0;mso-wrap-distance-left:0;mso-wrap-distance-top:0;mso-wrap-distance-right:0;mso-wrap-distance-bottom:0;mso-position-horizontal:absolute;mso-position-horizontal-relative:page;mso-position-vertical:absolute;mso-position-vertical-relative:page;mso-width-percent:0;mso-height-percent:0;mso-width-relative:page;mso-height-relative:page;v-text-anchor:top" o:gfxdata="UEsDBBQABgAIAAAAIQC2gziS/gAAAOEBAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbJSRQU7DMBBF 90jcwfIWJU67QAgl6YK0S0CoHGBkTxKLZGx5TGhvj5O2G0SRWNoz/78nu9wcxkFMGNg6quQqL6RA 0s5Y6ir5vt9lD1JwBDIwOMJKHpHlpr69KfdHjyxSmriSfYz+USnWPY7AufNIadK6MEJMx9ApD/oD OlTrorhX2lFEilmcO2RdNtjC5xDF9pCuTyYBB5bi6bQ4syoJ3g9WQ0ymaiLzg5KdCXlKLjvcW893 SUOqXwnz5DrgnHtJTxOsQfEKIT7DmDSUCaxw7Rqn8787ZsmRM9e2VmPeBN4uqYvTtW7jvijg9N/y JsXecLq0q+WD6m8AAAD//wMAUEsDBBQABgAIAAAAIQA4/SH/1gAAAJQBAAALAAAAX3JlbHMvLnJl bHOkkMFqwzAMhu+DvYPRfXGawxijTi+j0GvpHsDYimMaW0Yy2fr2M4PBMnrbUb/Q94l/f/hMi1qR JVI2sOt6UJgd+ZiDgffL8ekFlFSbvV0oo4EbChzGx4f9GRdb25HMsYhqlCwG5lrLq9biZkxWOiqY 22YiTra2kYMu1l1tQD30/bPm3wwYN0x18gb45AdQl1tp5j/sFB2T0FQ7R0nTNEV3j6o9feQzro1i OWA14Fm+Q8a1a8+Bvu/d/dMb2JY5uiPbhG/ktn4cqGU/er3pcvwCAAD//wMAUEsDBBQABgAIAAAA IQBucdLJtAIAALUFAAAOAAAAZHJzL2Uyb0RvYy54bWysVMlu2zAQvRfoPxC8K1pMO5IQOUgsqyiQ LkDSD6AlyiIqkSpJW06D/nuHlGVnuRRtdSBGnOGb7c1cXR+6Fu2Z0lyKDIcXAUZMlLLiYpvhbw+F F2OkDRUVbaVgGX5kGl8v37+7GvqURbKRbcUUAhCh06HPcGNMn/q+LhvWUX0heyZAWUvVUQO/autX ig6A3rV+FAQLf5Cq6pUsmdZwm49KvHT4dc1K86WuNTOozTDEZtyp3Lmxp7+8oulW0b7h5TEM+hdR dJQLcHqCyqmhaKf4G6iOl0pqWZuLUna+rGteMpcDZBMGr7K5b2jPXC5QHN2fyqT/H2z5ef9VIV5l OApmGAnaQZMe2MGgW3lAUGFboaHXKRje92BqDqCATrtsdX8ny+8aCblqqNiyG6Xk0DBaQYShfek/ ezriaAuyGT7JChzRnZEO6FCrzpYPCoIAHTr1eOqODaaESxLMZzEBVQm6+WxGxuB8mk6ve6XNByY7 
ZIUMK+i+Q6f7O21sNDSdTKwzIQveto4BrXhxAYbjDfiGp1Zno3ANfUqCZB2vY+KRaLH2SJDn3k2x It6iCC/n+SxfrfLwl/UbkrThVcWEdTORKyR/1rwjzUdanOilZcsrC2dD0mq7WbUK7SmQu3Cfqzlo zmb+yzBcESCXVymFEQluo8QrFvGlRwoy95LLIPaCMLlNFgFJSF68TOmOC/bvKaEhw8k8mo9kOgf9 KrfAfW9zo2nHDayPlncZjk9GNLUUXIvKtdZQ3o7ys1LY8M+lgHZPjXaEtRwd2WoOm4ObjvA0CBtZ PQKFlQSGARlh94HQSPUTowH2SIb1jx1VDKP2o4AxsEtnEtQkbCaBihKeZthgNIorMy6nXa/4tgHk cdCEvIFRqbljsZ2pMYrjgMFucMkc95hdPs//ndV52y5/AwAA//8DAFBLAwQUAAYACAAAACEAYnFn FeEAAAAOAQAADwAAAGRycy9kb3ducmV2LnhtbEyPwU7DMBBE70j8g7VI3KjdVERtGqeqEJyQEGk4 cHTibWI1XofYbcPf45zobVYzmnmb7ybbswuO3jiSsFwIYEiN04ZaCV/V29MamA+KtOodoYRf9LAr 7u9ylWl3pRIvh9CyWEI+UxK6EIaMc990aJVfuAEpekc3WhXiObZcj+oay23PEyFSbpWhuNCpAV86 bE6Hs5Ww/6by1fx81J/lsTRVtRH0np6kfHyY9ltgAafwH4YZP6JDEZlqdybtWS9hla4ieoiGEM8p sDkiRJIAq2e13qTAi5zfvlH8AQAA//8DAFBLAQItABQABgAIAAAAIQC2gziS/gAAAOEBAAATAAAA AAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0AFAAGAAgAAAAhADj9If/WAAAA lAEAAAsAAAAAAAAAAAAAAAAALwEAAF9yZWxzLy5yZWxzUEsBAi0AFAAGAAgAAAAhAG5x0sm0AgAA tQUAAA4AAAAAAAAAAAAAAAAALgIAAGRycy9lMm9Eb2MueG1sUEsBAi0AFAAGAAgAAAAhAGJxZxXh AAAADgEAAA8AAAAAAAAAAAAAAAAADgUAAGRycy9kb3ducmV2LnhtbFBLBQYAAAAABAAEAPMAAAAc BgAAAAA= " filled="f" stroked="f">
								<v:textbox inset="0,0,0,0">
									<w:txbxContent>
										<w:p w:rsidR="00FA35B2" w:rsidRDefault="00040E93">
											<w:pPr>
												<w:spacing w:before="44" w:after="24" w:line="254" w:lineRule="exact"/>
												<w:textAlignment w:val="baseline"/>
												<w:rPr>
													<w:rFonts w:ascii="Arial" w:eastAsia="Arial" w:hAnsi="Arial"/>
													<w:color w:val="000000"/>
													<w:spacing w:val="-1"/>
												</w:rPr>
											</w:pPr>
											<w:r>
												<w:rPr>
													<w:rFonts w:ascii="Arial" w:eastAsia="Arial" w:hAnsi="Arial"/>
													<w:color w:val="000000"/>
													<w:spacing w:val="-1"/>
												</w:rPr>
												<w:t xml:space="preserve">
This role includes a moderate amount of activity and the physical movement of stock, including
</w:t>
											</w:r>
											<w:r>
												<w:rPr>
													<w:rFonts w:ascii="Arial" w:eastAsia="Arial" w:hAnsi="Arial"/>
													<w:color w:val="000000"/>
													<w:spacing w:val="-1"/>
												</w:rPr>
												<w:t>
lifting and moving objects may be required, all necessary health and safety training will be provided.
</w:t>
											</w:r>
										</w:p>
									</w:txbxContent>
								</v:textbox>
								<w10:wrap type="square" anchorx="page" anchory="page"/>
							</v:shape>
						</w:pict>
					</mc:Fallback>
				</mc:AlternateContent>
			</w:r>
			<w:r></w:r>
			<w:r></w:r>



Steps to Reproduce

Please provide a code sample that reproduces the issue.

<?php

/**
 * Minimal reproduction harness: loads a Word2007 file through PHPWord and
 * echoes the text found in its top-level elements.
 */
class ParseDocx
{
    /** Path handed to the PHPWord reader in process(). */
    private string $path;

    /** Initialised to null in the constructor; not read anywhere in this class. */
    private mixed  $last_element;

    /**
     * @param string $path Location of the document to load.
     */
    public function __construct(String $path)
    {
        $this->last_element = null;
        $this->path         = $path;
    }

    /**
     * Echo a textual rendering of one element, descending into TextRun children.
     *
     * @param mixed $element Element to render; dispatch is on its class basename.
     * @param int   $level   Nesting depth, only used for the debug log entry.
     * @param bool  $in_run  True when called for a child of a TextRun; only logged.
     */
    protected function processElements($element, $level = 0, $in_run = false) {
        $kind = class_basename($element);
        Log::debug("PE", ['class' => $kind, 'lvl' => $level, 'run' => $in_run]);

        if ($kind === 'TextRun') {
            echo "TEXT RUN \n\n";
            // Children of a run are rendered one level deeper and flagged as in-run.
            foreach ($element->getElements() as $child) {
                $this->processElements($child, $level + 1, true);
            }

            return;
        }

        if ($kind === 'Text') {
            echo $element->getText();

            return;
        }

        if ($kind === 'TextBreak') {
            echo "\n\n";

            return;
        }

        if ($kind === 'PageBreak') {
            echo "\n\n ==== \n\n";

            return;
        }

        // Note: for this example most element types were removed as not relevant,
        // so every other element kind is ignored here.
    }

    /**
     * Load the document with the Word2007 reader and render every element of
     * every section via processElements().
     */
    public function process()
    {
        $reader   = \PhpOffice\PhpWord\IOFactory::createReader('Word2007');
        $document = $reader->load($this->path);

        foreach ($document->getSections() as $section) {
            foreach ($section->getElements() as $node) {
                $this->processElements($node);
            }
        }
    }

}

If you invoke this code by constructing the object and calling process() on this file: https://source.data.ox.ac.uk/vacancies/125005/0013254672.docx, you will see only a few of the text runs in the output compared to what is in the XML file.

Please fill in your environment information:

  • PHP Version: 8.2
  • PHPWord Version: [1.1.0] (2023-05-30)

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions