-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathBatchParsingRules.cs
128 lines (115 loc) · 4.93 KB
/
BatchParsingRules.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
// Copyright © 2023 Textkernel BV. All rights reserved.
// This file is provided for use by, or on behalf of, Textkernel licensees
// within the terms of their license of Textkernel products or Textkernel customers
// within the Terms of Service pertaining to the Textkernel SaaS products.
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.IO;
using System.Linq;
namespace Textkernel.Tx.Batches
{
/// <summary>
/// Rules to limit invalid parse transactions (and reduce parsing costs).
/// </summary>
public class BatchParsingRules
{
    /// <summary>
    /// The default file types that will result in invalid parse transactions (and cost unnecessary credits).
    /// Entries are bare extensions, without a leading period.
    /// </summary>
    public static ReadOnlyCollection<string> DefaultDisallowedFileTypes = new ReadOnlyCollection<string>(new List<string>()
    {
        //images
        "png",
        "jpg",
        "jpeg",
        "bmp",
        "tiff",
        "gif",
        "webp",
        "psd",
        "raw",
        //binaries
        "exe",
        "dll",
        "deb",
        "app",
        //archives
        "zip",
        "tar",
        "gz",
        "7z",
        //structured
        "json",
        "xml",
        "csv"
    });

    /// <summary>
    /// The maximum number of files allowed in a batch parse. If a directory contains more valid files, an error is thrown.
    /// This is important to keep users from unknowingly consuming large numbers of parsing credits.
    /// </summary>
    public int MaxBatchSize { get; set; }

    /// <summary>
    /// File types to skip. Stored without leading periods (for example "png", not ".png").
    /// </summary>
    public IEnumerable<string> DisallowedFileTypes { get; protected set; }

    /// <summary>
    /// ANY value in here will mean the <see cref="DisallowedFileTypes"/> property is ignored in practice,
    /// because only types in this list are allowed. Stored without leading periods.
    /// </summary>
    public IEnumerable<string> AllowedFileTypes { get; protected set; }

    /// <summary>
    /// A custom function to decide whether or not a file should be parsed. It should return <see langword="true"/> to parse the file.
    /// </summary>
    public Func<string, bool> ShouldProcessFile { get; protected set; }

    /// <summary>
    /// Create a set of rules to limit invalid parse transactions (and reduce parsing costs).
    /// </summary>
    /// <param name="maxBatchSize">
    /// The maximum number of files allowed in a batch parse. If a directory contains more valid files, an error is thrown.
    /// This is important to keep users from unknowingly consuming large numbers of parsing credits.
    /// </param>
    /// <param name="disallowedFileTypes">
    /// File types to skip. Use the <see cref="DefaultDisallowedFileTypes"/> unless you have a specific use case.
    /// When <see langword="null"/>, <see cref="DefaultDisallowedFileTypes"/> is used.
    /// Leading periods are removed and <see langword="null"/> entries are ignored.
    /// </param>
    /// <param name="allowedFileTypes">
    /// File types to exclusively allow. ANY value in here will mean the <see cref="DisallowedFileTypes"/>
    /// property is ignored and only types in this list are allowed.
    /// Leading periods are removed and <see langword="null"/> entries are ignored.
    /// </param>
    /// <param name="shouldProcessFn">
    /// A custom function to decide whether or not a file should be parsed. It should return <see langword="true"/> to parse the file.
    /// This could be used, for example, to check if you have already parsed a particular
    /// file in your system before spending credits to parse it again.
    /// NOTE: If defined, this will be called only AFTER a file passes the other 'file type' checks.
    /// </param>
    public BatchParsingRules(int maxBatchSize,
        IEnumerable<string> disallowedFileTypes = null,
        IEnumerable<string> allowedFileTypes = null,
        Func<string, bool> shouldProcessFn = null)
    {
        MaxBatchSize = maxBatchSize;
        ShouldProcessFile = shouldProcessFn;
        DisallowedFileTypes = NormalizeFileTypes(disallowedFileTypes ?? DefaultDisallowedFileTypes);
        AllowedFileTypes = NormalizeFileTypes(allowedFileTypes ?? Enumerable.Empty<string>());
    }

    /// <summary>
    /// Strips leading periods from each file type and snapshots the result into a list,
    /// so the work is done once here instead of on every <see cref="FileIsAllowed"/> call.
    /// </summary>
    private static List<string> NormalizeFileTypes(IEnumerable<string> fileTypes)
    {
        return fileTypes
            .Where(fileType => fileType != null)
            .Select(fileType => fileType.TrimStart('.'))
            .ToList();
    }

    /// <summary>
    /// Decides whether a file should be parsed according to these rules.
    /// </summary>
    /// <param name="file">The file name or path to check.</param>
    /// <returns>
    /// <see langword="false"/> if <see cref="AllowedFileTypes"/> is non-empty and does not contain the file's extension,
    /// or if <see cref="DisallowedFileTypes"/> contains it (both compared case-insensitively);
    /// otherwise the result of <see cref="ShouldProcessFile"/> when it is set, or <see langword="true"/> when it is not.
    /// </returns>
    internal bool FileIsAllowed(string file)
    {
        // Path.GetExtension keeps the leading period (".pdf") and returns null for a null path,
        // while the configured lists hold bare extensions ("pdf"). Normalize before comparing,
        // otherwise no extension ever matches either list.
        string fileExt = (Path.GetExtension(file) ?? string.Empty).TrimStart('.');

        if (AllowedFileTypes.Any() && !AllowedFileTypes.Contains(fileExt, StringComparer.OrdinalIgnoreCase))
        {
            return false;
        }

        if (DisallowedFileTypes.Contains(fileExt, StringComparer.OrdinalIgnoreCase))
        {
            return false;
        }

        // The custom predicate is consulted only after the file type checks have passed.
        if (ShouldProcessFile != null)
        {
            return ShouldProcessFile(file);
        }

        return true;
    }
}
}