Skip to content

Commit e230a78

Browse files
committed
Initial commit
0 parents  commit e230a78

15 files changed

+436
-0
lines changed

.classpath

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry kind="src" path="src/main/java"/>
4+
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
5+
<attributes>
6+
<attribute name="maven.pomderived" value="true"/>
7+
</attributes>
8+
</classpathentry>
9+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER">
10+
<attributes>
11+
<attribute name="module" value="true"/>
12+
<attribute name="maven.pomderived" value="true"/>
13+
</attributes>
14+
</classpathentry>
15+
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
16+
<attributes>
17+
<attribute name="maven.pomderived" value="true"/>
18+
</attributes>
19+
</classpathentry>
20+
<classpathentry kind="output" path="target/classes"/>
21+
</classpath>

.gitignore

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
### Build output and tooling ###
2+
HELP.md
3+
target/
4+
!.mvn/wrapper/maven-wrapper.jar
5+
!**/src/main/**/target/
6+
!**/src/test/**/target/
7+
8+
### STS ###
9+
.apt_generated
10+
.classpath
11+
.factorypath
12+
.project
13+
.settings
14+
.springBeans
15+
.sts4-cache
16+
17+
### IntelliJ IDEA ###
18+
.idea
19+
*.iws
20+
*.iml
21+
*.ipr
22+
23+
### NetBeans ###
24+
/nbproject/private/
25+
/nbbuild/
26+
/dist/
27+
/nbdist/
28+
/.nb-gradle/
29+
build/
30+
!**/src/main/**/build/
31+
!**/src/test/**/build/
32+
33+
### VS Code ###
34+
.vscode/
35+
36+
/target
37+
.mvn
38+
maven-wrapper
39+
mvnw.cmd
40+
### Java ###
41+
# Compiled class file
42+
*.class
43+
44+
# Log file
45+
*.log
46+
47+
# BlueJ files
48+
*.ctxt
49+
50+
# Mobile Tools for Java (J2ME)
51+
.mtj.tmp/
52+
53+
# Package Files #
54+
*.jar
55+
*.war
56+
*.nar
57+
*.ear
58+
*.zip
59+
*.tar.gz
60+
*.rar
61+
62+
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
63+
hs_err_pid*

.project

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>ebird-csv-parser</name>
4+
<comment></comment>
5+
<projects>
6+
</projects>
7+
<buildSpec>
8+
<buildCommand>
9+
<name>org.eclipse.jdt.core.javabuilder</name>
10+
<arguments>
11+
</arguments>
12+
</buildCommand>
13+
<buildCommand>
14+
<name>org.eclipse.m2e.core.maven2Builder</name>
15+
<arguments>
16+
</arguments>
17+
</buildCommand>
18+
</buildSpec>
19+
<natures>
20+
<nature>org.eclipse.jdt.core.javanature</nature>
21+
<nature>org.eclipse.m2e.core.maven2Nature</nature>
22+
</natures>
23+
</projectDescription>
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
eclipse.preferences.version=1
2+
org.eclipse.jdt.apt.aptEnabled=false
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
eclipse.preferences.version=1
2+
org.eclipse.jdt.core.compiler.codegen.targetPlatform=17
3+
org.eclipse.jdt.core.compiler.compliance=17
4+
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
5+
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
6+
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
7+
org.eclipse.jdt.core.compiler.processAnnotations=disabled
8+
org.eclipse.jdt.core.compiler.release=enabled
9+
org.eclipse.jdt.core.compiler.source=17
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
activeProfiles=
2+
eclipse.preferences.version=1
3+
resolveWorkspaceProjects=true
4+
version=1

pom.xml

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
2+
<modelVersion>4.0.0</modelVersion>
3+
<groupId>fun.seabird</groupId>
4+
<artifactId>ebird-csv-parser</artifactId>
5+
<version>0.0.1-SNAPSHOT</version>
6+
<name>ebird-csv-parser</name>
7+
8+
<build>
9+
<plugins>
10+
<!-- any other plugins -->
11+
<plugin>
12+
<artifactId>maven-assembly-plugin</artifactId>
13+
<executions>
14+
<execution>
15+
<phase>package</phase>
16+
<goals>
17+
<goal>single</goal>
18+
</goals>
19+
</execution>
20+
</executions>
21+
<configuration>
22+
<archive>
23+
<manifest>
24+
<mainClass>fun.seabird.MediaSorter</mainClass>
25+
</manifest>
26+
</archive>
27+
<descriptorRefs>
28+
<descriptorRef>jar-with-dependencies</descriptorRef>
29+
</descriptorRefs>
30+
</configuration>
31+
</plugin>
32+
33+
<plugin>
34+
<groupId>org.apache.maven.plugins</groupId>
35+
<artifactId>maven-compiler-plugin</artifactId>
36+
<version>3.11.0</version>
37+
<configuration>
38+
<release>17</release>
39+
</configuration>
40+
</plugin>
41+
42+
</plugins>
43+
</build>
44+
45+
46+
<dependencies>
47+
48+
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
49+
<dependency>
50+
<groupId>com.google.guava</groupId>
51+
<artifactId>guava</artifactId>
52+
<version>31.1-jre</version>
53+
</dependency>
54+
55+
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
56+
<dependency>
57+
<groupId>org.apache.commons</groupId>
58+
<artifactId>commons-lang3</artifactId>
59+
<version>3.12.0</version>
60+
</dependency>
61+
62+
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
63+
<dependency>
64+
<groupId>org.slf4j</groupId>
65+
<artifactId>slf4j-api</artifactId>
66+
<version>2.0.7</version>
67+
</dependency>
68+
69+
<!-- https://mvnrepository.com/artifact/io.projectreactor/reactor-core -->
70+
<dependency>
71+
<groupId>io.projectreactor</groupId>
72+
<artifactId>reactor-core</artifactId>
73+
<version>3.5.5</version>
74+
</dependency>
75+
76+
<dependency>
77+
<groupId>org.apache.commons</groupId>
78+
<artifactId>commons-csv</artifactId>
79+
<version>1.10.0</version>
80+
</dependency>
81+
82+
<dependency>
83+
<groupId>org.projectlombok</groupId>
84+
<artifactId>lombok</artifactId>
85+
<version>1.18.26</version>
86+
<scope>provided</scope>
87+
</dependency>
88+
89+
90+
</dependencies>
91+
92+
</project>
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
package fun.seabird;
2+
3+
import java.io.IOException;
4+
import java.io.Reader;
5+
import java.nio.file.Files;
6+
import java.nio.file.Path;
7+
import java.time.LocalDate;
8+
import java.time.LocalDateTime;
9+
import java.time.LocalTime;
10+
import java.time.format.DateTimeFormatter;
11+
import java.util.Arrays;
12+
import java.util.Comparator;
13+
import java.util.List;
14+
import java.util.concurrent.TimeUnit;
15+
import java.util.concurrent.atomic.AtomicInteger;
16+
import java.util.function.Consumer;
17+
18+
import org.apache.commons.csv.CSVFormat;
19+
import org.apache.commons.csv.CSVParser;
20+
import org.apache.commons.csv.CSVRecord;
21+
import org.apache.commons.lang3.StringUtils;
22+
import org.apache.commons.lang3.time.StopWatch;
23+
import org.slf4j.Logger;
24+
import org.slf4j.LoggerFactory;
25+
26+
import reactor.core.publisher.Flux;
27+
import reactor.core.scheduler.Schedulers;
28+
29+
public class EbirdCsvParser
30+
{
31+
private static final Logger logger = LoggerFactory.getLogger(EbirdCsvParser.class);
32+
33+
public enum ParseMode {single,parallel}
34+
35+
public enum PreSort {none,obsDt}
36+
37+
private static final DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("hh:mm a");
38+
39+
private static final DateTimeFormatter csvDtf = DateTimeFormatter.ofPattern("yyyy-MM-dd hh:mm a");
40+
41+
private static final AtomicInteger linesProcessed = new AtomicInteger(0);
42+
43+
static LocalDateTime parseSubDate(CSVRecord record)
44+
{
45+
if (record.getRecordNumber() == 1l)
46+
return LocalDateTime.MIN;
47+
48+
String obsTimeStr = record.get(12);
49+
if (StringUtils.isBlank(obsTimeStr))
50+
obsTimeStr = "12:00 AM";
51+
52+
return LocalDateTime.parse(record.get(11) + " " + obsTimeStr, csvDtf);
53+
}
54+
55+
private static EbirdCsvRow parseCsvLine(CSVRecord record)
56+
{
57+
if (record.getRecordNumber() == 1l)
58+
return null; // skip the header
59+
60+
EbirdCsvRow row = new EbirdCsvRow();
61+
62+
row.setSubId(record.get(0));
63+
row.setCommonName(record.get(1));
64+
row.setSciName(record.get(2));
65+
row.setTaxonOrder(Double.parseDouble(record.get(3)));
66+
row.setCount(record.get(4));
67+
row.setSubnat1Code(record.get(5));
68+
row.setSubnat2Name(record.get(6));
69+
row.setLocId(record.get(7));
70+
row.setLocName(record.get(8));
71+
row.setLat(Double.parseDouble(record.get(9)));
72+
row.setLng(Double.parseDouble(record.get(10)));
73+
row.setDate(LocalDate.parse(record.get(11))); // Assuming the date format is ISO-8601 (yyyy-MM-dd)
74+
75+
String timeStr = record.get(12);
76+
if (!StringUtils.isEmpty(timeStr))
77+
row.setTime(LocalTime.parse(timeStr,timeFormatter)); // Assuming the time format is ISO-8601 (HH:mm:ss)
78+
79+
row.setProtocol(record.get(13));
80+
81+
if (!record.get(14).isEmpty())
82+
row.setDuration(Integer.parseInt(record.get(14)));
83+
else
84+
row.setDuration(0);
85+
86+
row.setCompleteChecklist(record.get(15).equals("1"));
87+
88+
if (record.size() > 16 && !record.get(16).isEmpty()) {
89+
row.setDistanceKm(Double.parseDouble(record.get(16)));
90+
}
91+
92+
if (record.size() > 17 && !record.get(17).isEmpty()) {
93+
row.setAreaHa(Double.parseDouble(record.get(17)));
94+
}
95+
96+
if (record.size() > 18 && !record.get(18).isEmpty())
97+
row.setPartySize(Integer.parseInt(record.get(18)));
98+
99+
if (record.size() > 19)
100+
row.setBreedingCode(record.get(19));
101+
102+
// Parsing the space-separated String into a List of Long values
103+
if (record.size() > 22) {
104+
String assetIdsString = record.get(22);
105+
List<Long> assetIds = Arrays.stream(assetIdsString.split(" "))
106+
.map(Long::parseLong)
107+
.toList();
108+
row.setAssetIds(assetIds);
109+
}
110+
111+
return row;
112+
}
113+
114+
/**
115+
* Parses eBird CSV file using Apache Commons CSV library, and processes each line in parallel.
116+
*
117+
* @param csvFile The path to the CSV file to be parsed.
118+
* @throws IOException If an I/O error occurs while reading the CSV file.
119+
*/
120+
public static void parseCsv(Path csvFile,ParseMode mode,PreSort preSort,Consumer<EbirdCsvRow> rowProcessor) throws IOException
121+
{
122+
logger.info("Parsing " + csvFile + "...");
123+
124+
linesProcessed.set(0);
125+
126+
try (Reader fileReader = Files.newBufferedReader(csvFile);
127+
CSVParser csvParser = new CSVParser(fileReader,
128+
CSVFormat.DEFAULT.builder().setSkipHeaderRecord(true).build())) {
129+
130+
StopWatch stopwatch = StopWatch.createStarted();
131+
132+
Iterable<CSVRecord> records;
133+
if (PreSort.obsDt == preSort)
134+
{
135+
// Read all lines and sort by date and time columns
136+
List<CSVRecord> recordsList = csvParser.getRecords();
137+
recordsList.sort(Comparator.comparing(EbirdCsvParser::parseSubDate));
138+
logger.info("Read " + (recordsList.size()-1) + " and sorted eBird observations in " + stopwatch.getTime(TimeUnit.SECONDS) + " seconds");
139+
records = recordsList;
140+
}
141+
else
142+
records = csvParser;
143+
144+
Consumer<CSVRecord> csvRecordConsumer = new Consumer<CSVRecord>() {
145+
@Override
146+
public void accept(CSVRecord record)
147+
{
148+
EbirdCsvRow row = parseCsvLine(record);
149+
if (row == null) //header row
150+
return;
151+
152+
rowProcessor.accept(row);
153+
linesProcessed.getAndIncrement();
154+
}
155+
};
156+
157+
switch (mode)
158+
{
159+
case parallel:
160+
Flux.fromIterable(records).parallel().runOn(Schedulers.parallel()).sequential(25000).doOnNext(csvRecordConsumer).then().block();
161+
break;
162+
case single:
163+
Flux.fromIterable(records).doOnNext(csvRecordConsumer).then().block();
164+
break;
165+
}
166+
167+
stopwatch.stop();
168+
logger.info("Processed " + linesProcessed.get() + " eBird observations in " + stopwatch.getTime(TimeUnit.SECONDS) + " seconds");
169+
}
170+
}
171+
172+
}

0 commit comments

Comments
 (0)