@@ -38,6 +38,11 @@ public interface Task
38
38
// Task option "total_file_count_limit"; when absent it defaults to 2147483647 (Integer.MAX_VALUE).
// Builder(Task) copies this value into its limitCount field.
@ Config ("total_file_count_limit" )
39
39
@ ConfigDefault ("2147483647" )
40
40
int getTotalFileCountLimit ();
41
+
42
// Task option "min_task_size" (a long, default 0). Builder(Task) copies it into minTaskSize, which
// getSplits uses as the cumulative entry-size threshold at which the current task is closed.
+ // TODO support more algorithms to combine tasks
43
+ @ Config ("min_task_size" )
44
+ @ ConfigDefault ("0" )
45
+ long getMinTaskSize ();
41
46
}
42
47
43
48
public static class Entry
@@ -69,22 +74,25 @@ public static class Builder
69
74
private String last = null ;
70
75
71
76
private int limitCount = Integer .MAX_VALUE ;
77
+ private long minTaskSize = 1 ;
72
78
private Pattern pathMatchPattern ;
73
79
74
80
private final ByteBuffer castBuffer = ByteBuffer .allocate (4 );
75
81
76
82
/**
 * Creates a builder whose settings are taken from a {@link Task}.
 *
 * <p>After running the no-arg constructor, this reads the path match pattern,
 * the total file count limit and the minimum task size from the task, in that order.
 *
 * @param task source of the path match pattern, file count limit and minimum task size
 */
public Builder(Task task)
{
    this();
    pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
    limitCount = task.getTotalFileCountLimit();
    minTaskSize = task.getMinTaskSize();
}
82
89
83
90
public Builder (ConfigSource config )
84
91
{
85
92
this ();
86
93
this .pathMatchPattern = Pattern .compile (config .get (String .class , "path_match_pattern" , ".*" ));
87
94
this .limitCount = config .get (int .class , "total_file_count_limit" , Integer .MAX_VALUE );
95
+ this .minTaskSize = config .get (long .class , "min_task_size" , 0L );
88
96
}
89
97
90
98
public Builder ()
@@ -104,6 +112,12 @@ public Builder limitTotalFileCount(int limitCount)
104
112
return this ;
105
113
}
106
114
115
+ public Builder minTaskSize (long bytes )
116
+ {
117
+ this .minTaskSize = bytes ;
118
+ return this ;
119
+ }
120
+
107
121
public Builder pathMatchPattern (String pattern )
108
122
{
109
123
this .pathMatchPattern = Pattern .compile (pattern );
@@ -163,10 +177,20 @@ public FileList build()
163
177
164
178
private List <List <Entry >> getSplits (List <Entry > all )
165
179
{
166
- // TODO combine multiple entries into one task using some configuration parameters
167
180
List <List <Entry >> tasks = new ArrayList <>();
181
+ long currentTaskSize = 0 ;
182
+ List <Entry > currentTask = new ArrayList <>();
168
183
for (Entry entry : all ) {
169
- tasks .add (ImmutableList .of (entry ));
184
+ currentTask .add (entry );
185
+ currentTaskSize += entry .getSize (); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
186
+ if (currentTaskSize >= minTaskSize ) {
187
+ tasks .add (currentTask );
188
+ currentTask = new ArrayList <>();
189
+ currentTaskSize = 0 ;
190
+ }
191
+ }
192
+ if (!currentTask .isEmpty ()) {
193
+ tasks .add (currentTask );
170
194
}
171
195
return tasks ;
172
196
}
0 commit comments