Skip to content

Commit

Permalink
Netstandard/itextsharp (#568)
Browse files Browse the repository at this point in the history
* Add separate project for iTextSharp pdf text extractor.
* Add readme and itextsharp dll to the package.
  • Loading branch information
tusmester authored Feb 11, 2019
1 parent a26862f commit a994d07
Show file tree
Hide file tree
Showing 12 changed files with 297 additions and 79 deletions.
85 changes: 11 additions & 74 deletions src/ContentRepository/Search/Indexing/TextExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
using System.IO.Compression;
using System.Text;
using System.Xml;
using iTextSharp.text.pdf;
using SenseNet.ContentRepository.Storage;
using SenseNet.Diagnostics;
using SenseNet.Search;
Expand Down Expand Up @@ -297,12 +296,19 @@ public override string Extract(Stream stream, TextExtractorContext context)
return GetOpenXmlText(stream, context);
}
}
internal sealed class PdfTextExtractor : TextExtractor
public class PdfTextExtractor : TextExtractor
{
private static bool _iFilterErrorLogged;

public override string Extract(Stream stream, TextExtractorContext context)
{
return ExtractiFilter(stream, out _);
}

protected string ExtractiFilter(Stream stream, out bool success)
{
success = true;

try
{
// extract text using IFilter
Expand All @@ -312,9 +318,7 @@ public override string Extract(Stream stream, TextExtractorContext context)
{
SnLog.WriteWarning("Pdf text extract failed with out of memory exception. " + ex,
EventId.Indexing,
properties: new Dictionary<string, object> {{"Stream size", stream.Length}});

return string.Empty;
properties: new Dictionary<string, object> { { "Stream size", stream.Length } });
}
catch (Exception ex)
{
Expand All @@ -326,76 +330,9 @@ public override string Extract(Stream stream, TextExtractorContext context)
}
}

// fallback to the other mechanism in case the pdf IFilter is missing
var text = new StringBuilder();
success = false;

try
{
var pdfReader = new PdfReader(stream);
for (var page = 1; page <= pdfReader.NumberOfPages; page++)
{
// extract text using the old version (4.1.6) of iTextSharp
var pageText = ExtractTextFromPdfBytes(pdfReader.GetPageContent(page));
if (string.IsNullOrEmpty(pageText))
continue;

text.Append(pageText);
}
}
catch (OutOfMemoryException ex)
{
SnLog.WriteWarning("Pdf text extract failed with out of memory exception. " + ex,
EventId.Indexing,
properties: new Dictionary<string, object> {{"Stream size", stream.Length}});
}

return text.ToString();
}

/// <summary>
/// Old algorithm designed to work with iTextSharp 4.1.6. Use iTextSharp version >= 5 if possible (license changes were made).
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
internal static string ExtractTextFromPdfBytes(byte[] input)
{
if (input == null || input.Length == 0)
return "";

var result = new StringBuilder();
var tokeniser = new PRTokeniser(input);

try
{
while (tokeniser.NextToken())
{
var tknType = tokeniser.TokenType;
var tknValue = tokeniser.StringValue.Replace('\0', ' ');

if (tknType == PRTokeniser.TK_STRING)
{
result.Append(tknValue);
}
else
{
switch (tknValue)
{
case "-600":
result.Append(" ");
break;
case "TJ":
result.Append(" ");
break;
}
}
}
}
finally
{
tokeniser.Close();
}

return result.ToString();
return string.Empty;
}
}
internal sealed class XmlTextExtractor : TextExtractor
Expand Down
4 changes: 0 additions & 4 deletions src/ContentRepository/SenseNet.ContentRepository.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,6 @@
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="itextsharp, Version=4.1.6.0, Culture=neutral, PublicKeyToken=8354ae6d2174ddca, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\References\itextsharp.dll</HintPath>
</Reference>
<Reference Include="Microsoft.CSharp" />
<Reference Include="Microsoft.JScript" />
<Reference Include="Newtonsoft.Json, Version=11.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
Expand Down
19 changes: 19 additions & 0 deletions src/SenseNet.sln
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SenseNet.BlobStorage", "Blo
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SenseNet.Scripting.JScript", "Scripting.JScript\SenseNet.Scripting.JScript.csproj", "{F934DA68-BAF1-4DA4-B09B-B018EF44E8AC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SenseNet.TextExtractors.Pdf", "TextExtractors.Pdf\SenseNet.TextExtractors.Pdf.csproj", "{67EC265E-A47C-4939-AAD3-A405EEC062F9}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -336,6 +338,22 @@ Global
{F934DA68-BAF1-4DA4-B09B-B018EF44E8AC}.Release|x64.Build.0 = Release|Any CPU
{F934DA68-BAF1-4DA4-B09B-B018EF44E8AC}.Release|x86.ActiveCfg = Release|Any CPU
{F934DA68-BAF1-4DA4-B09B-B018EF44E8AC}.Release|x86.Build.0 = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|x64.ActiveCfg = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|x64.Build.0 = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|x86.ActiveCfg = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Debug|x86.Build.0 = Debug|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|Any CPU.ActiveCfg = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|Any CPU.Build.0 = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|Mixed Platforms.Build.0 = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|x64.ActiveCfg = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|x64.Build.0 = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|x86.ActiveCfg = Release|Any CPU
{67EC265E-A47C-4939-AAD3-A405EEC062F9}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -359,6 +377,7 @@ Global
{0CB6D0A8-A477-4382-9F5A-C5BEBD2648AA} = {C68D256D-7D40-4E33-8A2B-B1625538B138}
{C250C071-6ACD-42E0-9FFC-63283AFB8C6C} = {2997D17C-A736-43E5-B3DD-11D11AC7DF17}
{F934DA68-BAF1-4DA4-B09B-B018EF44E8AC} = {2997D17C-A736-43E5-B3DD-11D11AC7DF17}
{67EC265E-A47C-4939-AAD3-A405EEC062F9} = {2997D17C-A736-43E5-B3DD-11D11AC7DF17}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {7D903DEB-CA0B-43D8-BD9D-820BB1453C4C}
Expand Down
1 change: 0 additions & 1 deletion src/Services/SenseNet.Services.nuspec
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
</dependencies>
</metadata>
<files>
<file src="bin\Release\itextsharp.*" target="lib\net461" />
<file src="bin\Release\SenseNet.ContentRepository.*" target="lib\net461" />
<file src="bin\Release\SenseNet.Services.*" target="lib\net461" />
<file src="bin\Release\SenseNet.Storage.*" target="lib\net461" />
Expand Down
24 changes: 24 additions & 0 deletions src/TextExtractors.Pdf/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// Assembly-level metadata for the SenseNet.TextExtractors.Pdf library.
// The title embeds the build configuration, chosen by the DEBUG compilation symbol.
#if DEBUG
[assembly: AssemblyTitle("SenseNet.TextExtractors.Pdf (Debug)")]
#else
[assembly: AssemblyTitle("SenseNet.TextExtractors.Pdf (Release)")]
#endif
// General descriptive attributes; description, configuration and culture are left empty.
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("Sense/Net Inc.")]
[assembly: AssemblyCopyright("Copyright © Sense/Net Inc.")]
[assembly: AssemblyProduct("sensenet iTextSharp PDF text extractor")]
[assembly: AssemblyTrademark("Sense/Net Inc.")]
[assembly: AssemblyCulture("")]

// Assembly, file and informational versions are all set to the same value here.
[assembly: AssemblyVersion("7.0.0.0")]
[assembly: AssemblyFileVersion("7.0.0.0")]
[assembly: AssemblyInformationalVersion("7.0.0.0")]

// Types in this assembly are not exposed to COM.
[assembly: ComVisible(false)]
// Assembly GUID.
// NOTE(review): presumably meant to stay in sync with the project's ProjectGuid - confirm.
[assembly: Guid("67ec265e-a47c-4939-aad3-a405eec062f9")]

17 changes: 17 additions & 0 deletions src/TextExtractors.Pdf/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
## iTextSharp text extractor for sensenet
This is a legacy package containing a PDF text extractor for sensenet, built on the last free version of _iTextSharp_. The package targets the .NET Framework, so it cannot be used in a .NET Core environment.

## Usage
Install the following NuGet package:

[![NuGet](https://img.shields.io/nuget/v/SenseNet.TextExtractors.Pdf.svg)](https://www.nuget.org/packages/SenseNet.TextExtractors.Pdf)

To configure the text extractor, please go to the `Indexing` settings in the Content Repository and set the following class for the `pdf` extension.

```json
{
"TextExtractors": {
"pdf": "SenseNet.TextExtractors.Pdf.iTextSharpPdfTextExtractor"
}
}
```
87 changes: 87 additions & 0 deletions src/TextExtractors.Pdf/SenseNet.TextExtractors.Pdf.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
  Classic (non-SDK style) MSBuild project for the SenseNet.TextExtractors.Pdf class library.
  It targets .NET Framework 4.6.1, references a local itextsharp.dll and depends on the
  SenseNet.ContentRepository project.
-->
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Import the common MSBuild props only when the file exists on the build machine. -->
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<!-- Global properties: defaults to Debug|AnyCPU when no configuration/platform is supplied. -->
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{67EC265E-A47C-4939-AAD3-A405EEC062F9}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>SenseNet.TextExtractors.Pdf</RootNamespace>
<AssemblyName>SenseNet.TextExtractors.Pdf</AssemblyName>
<TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
</PropertyGroup>
<!-- Debug build: full debug symbols, no optimization, DEBUG and TRACE constants defined. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<!-- Release build: pdb-only symbols, optimized, only the TRACE constant defined. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<!--
  Assembly references. itextsharp is resolved from the shared ..\References folder rather than
  from a NuGet package; the versioned references below are resolved from ..\packages.
  NOTE(review): several framework references (e.g. System.Data.OracleClient, System.Drawing,
  System.ServiceProcess, System.Transactions) look like template leftovers - confirm they are
  actually needed by this project.
-->
<ItemGroup>
<Reference Include="itextsharp">
<HintPath>..\References\itextsharp.dll</HintPath>
</Reference>
<Reference Include="SenseNet.Tools, Version=3.1.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\SenseNet.Tools.3.1.0\lib\netstandard2.0\SenseNet.Tools.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Configuration" />
<Reference Include="System.Configuration.ConfigurationManager, Version=4.0.1.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\packages\System.Configuration.ConfigurationManager.4.5.0\lib\net461\System.Configuration.ConfigurationManager.dll</HintPath>
</Reference>
<Reference Include="System.Core" />
<Reference Include="System.Data.OracleClient" />
<Reference Include="System.Diagnostics.EventLog, Version=4.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\packages\System.Diagnostics.EventLog.4.5.0\lib\net461\System.Diagnostics.EventLog.dll</HintPath>
</Reference>
<Reference Include="System.Drawing" />
<Reference Include="System.Net" />
<Reference Include="System.Security.AccessControl, Version=4.1.1.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.AccessControl.4.5.0\lib\net461\System.Security.AccessControl.dll</HintPath>
</Reference>
<Reference Include="System.Security.Permissions, Version=4.0.1.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.Permissions.4.5.0\lib\net461\System.Security.Permissions.dll</HintPath>
</Reference>
<Reference Include="System.Security.Principal.Windows, Version=4.1.1.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.Principal.Windows.4.5.0\lib\net461\System.Security.Principal.Windows.dll</HintPath>
</Reference>
<Reference Include="System.ServiceProcess" />
<Reference Include="System.Transactions" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
</ItemGroup>
<!--
  Compiled sources.
  NOTE(review): the file name is spelled "Textractor" (not "TextExtractor") - verify that it
  matches the file on disk before renaming anything.
-->
<ItemGroup>
<Compile Include="iTextSharpPdfTextractor.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<!-- Non-compiled project items: configuration, package list, readme and the NuGet manifest. -->
<ItemGroup>
<None Include="app.config" />
<None Include="packages.config" />
<None Include="Readme.md" />
<None Include="SenseNet.TextExtractors.Pdf.nuspec" />
</ItemGroup>
<!-- Project-to-project dependency on the content repository project. -->
<ItemGroup>
<ProjectReference Include="..\ContentRepository\SenseNet.ContentRepository.csproj">
<Project>{786e6165-ca02-45a9-bf58-207a45d7d6df}</Project>
<Name>SenseNet.ContentRepository</Name>
</ProjectReference>
</ItemGroup>
<!-- Standard C# build targets. -->
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>
25 changes: 25 additions & 0 deletions src/TextExtractors.Pdf/SenseNet.TextExtractors.Pdf.nuspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?xml version="1.0"?>
<!--
  NuGet package manifest for SenseNet.TextExtractors.Pdf: the legacy iTextSharp-based
  PDF text extractor for sensenet.
-->
<package >
<metadata>
<id>SenseNet.TextExtractors.Pdf</id>
<version>7.0.0</version>
<title>sensenet iTextSharp pdf text extractor</title>
<authors>kavics,tusmester</authors>
<owners>Sense/Net</owners>
<licenseUrl>https://github.com/SenseNet/sensenet/blob/master/LICENSE</licenseUrl>
<projectUrl>https://github.com/SenseNet/sensenet</projectUrl>
<iconUrl>https://raw.githubusercontent.com/SenseNet/sn-resources/master/images/sn-icon/sensenet-icon-64.png</iconUrl>
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<description>Legacy iTextSharp PDF text extractor for the sensenet platform.</description>
<releaseNotes>See release notes on GitHub.</releaseNotes>
<copyright>Copyright © Sense/Net Inc.</copyright>
<tags>sensenet ecm ecms pdf textextractor</tags>
<!-- The only declared package dependency is SenseNet.Services. -->
<dependencies>
<dependency id="SenseNet.Services" version="7.5.1.3" />
</dependencies>
</metadata>
<!--
  Package contents, taken from the Release build output folder and placed under lib\net461:
  the itextsharp files are shipped alongside this project's own output files.
-->
<files>
<file src="bin\Release\itextsharp.*" target="lib\net461" />
<file src="bin\Release\SenseNet.TextExtractors.Pdf.*" target="lib\net461" />
</files>
</package>
11 changes: 11 additions & 0 deletions src/TextExtractors.Pdf/app.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Application configuration for the project; it only defines an assembly binding redirect. -->
<configuration>
<runtime>
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
<!-- Redirect every Newtonsoft.Json version in the range 0.0.0.0-11.0.0.0 to version 11.0.0.0. -->
<dependentAssembly>
<assemblyIdentity name="Newtonsoft.Json" publicKeyToken="30ad4fe6b2a6aeed" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-11.0.0.0" newVersion="11.0.0.0" />
</dependentAssembly>
</assemblyBinding>
</runtime>
</configuration>
Loading

0 comments on commit a994d07

Please sign in to comment.