-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathBatchParser.cs
186 lines (169 loc) · 8.99 KB
/
BatchParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// Copyright © 2023 Textkernel BV. All rights reserved.
// This file is provided for use by, or on behalf of, Textkernel licensees
// within the terms of their license of Textkernel products or Textkernel customers
// within the Terms of Service pertaining to the Textkernel SaaS products.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Textkernel.Tx.Models;
using Textkernel.Tx.Models.API.Parsing;
using System.Threading.Tasks;
namespace Textkernel.Tx.Batches
{
/// <summary>
/// Signals that the set of files found for a batch is not acceptable
/// (see <see cref="BatchParsingRules"/> for the criteria a batch must meet).
/// </summary>
public class TxInvalidBatchException : Exception
{
    // internal: only code in this assembly creates this exception
    internal TxInvalidBatchException(string message)
        : base(message)
    {
    }
}
/// <summary>
/// A utility to parse batches of files.
/// </summary>
public class BatchParser
{
/// <summary>
/// Parses every allowed resume file found in a directory, one file at a time,
/// and reports the outcome of each file through the supplied callbacks.
/// </summary>
/// <param name="apiClient">The API client used to make the parse calls</param>
/// <param name="parseOptions">
/// Any parsing/indexing options, sent with every request. If its <c>IndexingOptions</c> is set,
/// the <c>DocumentId</c> on it is overwritten with the id generated for each file.
/// </param>
/// <param name="rules">
/// The rules applied to the files that are found before anything is parsed.
/// This is important to reduce the number of invalid parse API calls and reduce parsing costs.
/// </param>
/// <param name="directory">The directory containing the files to be parsed</param>
/// <param name="searchOption">Passed to the directory enumeration; controls whether subdirectories are searched</param>
/// <param name="successCallback">Invoked when a file is parsed successfully (may be null)</param>
/// <param name="partialSuccessCallback">Invoked when some error happened during/after parsing, but there is still usable data in the response (may be null)</param>
/// <param name="errorCallback">Invoked when an error occurred when parsing the file, and there is no usable data (may be null)</param>
/// <param name="generateDocumentIdFn">Given a file path, returns the DocumentId to use for that file. When null, a new GUID is used for each file</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="apiClient"/> or <paramref name="rules"/> is null, or <paramref name="directory"/> is null/whitespace</exception>
/// <exception cref="TxInvalidBatchException">Thrown when the directory provided does not meet the <see cref="BatchParsingRules"/></exception>
public static async Task ParseResumes(
    TxClient apiClient,
    ParseOptions parseOptions,
    BatchParsingRules rules,
    string directory,
    SearchOption searchOption,
    Func<ResumeBatchSuccessResult, Task> successCallback,
    Func<ResumeBatchPartialSuccessResult, Task> partialSuccessCallback,
    Func<BatchErrorResult, Task> errorCallback,
    Func<string, string> generateDocumentIdFn = null)
{
    if (apiClient == null)
    {
        throw new ArgumentNullException(nameof(apiClient));
    }

    //deliberately serial: multi-threading could cause the customer to violate the AUP accidentally
    foreach (string path in GetFiles(rules, directory, searchOption))
    {
        Document document = new Document(path);

        string documentId;
        if (generateDocumentIdFn != null)
        {
            documentId = generateDocumentIdFn(path);
        }
        else
        {
            documentId = Guid.NewGuid().ToString();
        }

        try
        {
            //the document id only matters when these documents are going to be indexed
            if (parseOptions?.IndexingOptions != null)
            {
                parseOptions.IndexingOptions.DocumentId = documentId;
            }

            ParseResumeResponse parsed = await apiClient.ParseResume(new ParseRequest(document, parseOptions));

            if (successCallback != null)
            {
                ResumeBatchSuccessResult success = new ResumeBatchSuccessResult(path, documentId, parsed);
                await successCallback(success);
            }
        }
        catch (TxUsableResumeException usable)
        {
            //not 100% successful, but there still might be usable data
            if (partialSuccessCallback != null)
            {
                ResumeBatchPartialSuccessResult partial = new ResumeBatchPartialSuccessResult(path, documentId, usable);
                await partialSuccessCallback(partial);
            }
        }
        catch (TxException failure)
        {
            //no usable data for this file; any other exception type is not caught and ends the batch
            if (errorCallback != null)
            {
                BatchErrorResult error = new BatchErrorResult(path, documentId, failure);
                await errorCallback(error);
            }
        }
    }
}
/// <summary>
/// Parses every allowed job file found in a directory, one file at a time,
/// and reports the outcome of each file through the supplied callbacks.
/// </summary>
/// <param name="apiClient">The API client used to make the parse calls</param>
/// <param name="parseOptions">
/// Any parsing/indexing options, sent with every request. If its <c>IndexingOptions</c> is set,
/// the <c>DocumentId</c> on it is overwritten with the id generated for each file.
/// </param>
/// <param name="rules">
/// The rules applied to the files that are found before anything is parsed.
/// This is important to reduce the number of invalid parse API calls and reduce parsing costs.
/// </param>
/// <param name="directory">The directory containing the files to be parsed</param>
/// <param name="searchOption">Passed to the directory enumeration; controls whether subdirectories are searched</param>
/// <param name="successCallback">Invoked when a file is parsed successfully (may be null)</param>
/// <param name="partialSuccessCallback">Invoked when some error happened during/after parsing, but there is still usable data in the response (may be null)</param>
/// <param name="errorCallback">Invoked when an error occurred when parsing the file, and there is no usable data (may be null)</param>
/// <param name="generateDocumentIdFn">Given a file path, returns the DocumentId to use for that file. When null, a new GUID is used for each file</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="apiClient"/> or <paramref name="rules"/> is null, or <paramref name="directory"/> is null/whitespace</exception>
/// <exception cref="TxInvalidBatchException">Thrown when the directory provided does not meet the <see cref="BatchParsingRules"/></exception>
public static async Task ParseJobs(
    TxClient apiClient,
    ParseOptions parseOptions,
    BatchParsingRules rules,
    string directory,
    SearchOption searchOption,
    Func<JobBatchSuccessResult, Task> successCallback,
    Func<JobBatchPartialSuccessResult, Task> partialSuccessCallback,
    Func<BatchErrorResult, Task> errorCallback,
    Func<string, string> generateDocumentIdFn = null)
{
    if (apiClient == null)
    {
        throw new ArgumentNullException(nameof(apiClient));
    }

    //deliberately serial: multi-threading could cause the customer to violate the AUP accidentally
    foreach (string path in GetFiles(rules, directory, searchOption))
    {
        Document document = new Document(path);

        string documentId;
        if (generateDocumentIdFn != null)
        {
            documentId = generateDocumentIdFn(path);
        }
        else
        {
            documentId = Guid.NewGuid().ToString();
        }

        try
        {
            //the document id only matters when these documents are going to be indexed
            if (parseOptions?.IndexingOptions != null)
            {
                parseOptions.IndexingOptions.DocumentId = documentId;
            }

            ParseJobResponse parsed = await apiClient.ParseJob(new ParseRequest(document, parseOptions));

            if (successCallback != null)
            {
                JobBatchSuccessResult success = new JobBatchSuccessResult(path, documentId, parsed);
                await successCallback(success);
            }
        }
        catch (TxUsableJobException usable)
        {
            //not 100% successful, but there still might be usable data
            if (partialSuccessCallback != null)
            {
                JobBatchPartialSuccessResult partial = new JobBatchPartialSuccessResult(path, documentId, usable);
                await partialSuccessCallback(partial);
            }
        }
        catch (TxException failure)
        {
            //no usable data for this file; any other exception type is not caught and ends the batch
            if (errorCallback != null)
            {
                BatchErrorResult error = new BatchErrorResult(path, documentId, failure);
                await errorCallback(error);
            }
        }
    }
}
/// <summary>
/// Finds the files under <paramref name="directory"/> that <paramref name="rules"/> allow,
/// and checks that the resulting batch is neither empty nor larger than the rules permit.
/// </summary>
/// <param name="rules">Decides which files are allowed and the maximum batch size</param>
/// <param name="directory">The directory to search for files</param>
/// <param name="searchOption">Passed to the directory enumeration; controls whether subdirectories are searched</param>
/// <returns>The allowed files, captured once at the time of this call</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="directory"/> is null/whitespace or <paramref name="rules"/> is null</exception>
/// <exception cref="TxInvalidBatchException">Thrown when no allowed files are found, or more are found than <c>rules.MaxBatchSize</c></exception>
private static IEnumerable<string> GetFiles(BatchParsingRules rules, string directory, SearchOption searchOption)
{
    if (string.IsNullOrWhiteSpace(directory)) throw new ArgumentNullException(nameof(directory));
    if (rules == null) throw new ArgumentNullException(nameof(rules));

    //use the rules to determine if this batch of files is valid (and which files).
    //Materialize the query exactly once: a deferred query would re-walk the directory and
    //re-run the rules on every Count()/foreach, and the files that end up being parsed could
    //then differ from the files that were validated here if the directory changes in between.
    List<string> files = Directory.EnumerateFiles(directory, "*", searchOption)
        .Where(f => rules.FileIsAllowed(f))
        .ToList();

    if (files.Count == 0)
    {
        throw new TxInvalidBatchException("No files found in given directory");
    }

    if (files.Count > rules.MaxBatchSize)
    {
        throw new TxInvalidBatchException("This batch is too large to process.");
    }

    return files;
}
}
}