summaryrefslogtreecommitdiff
path: root/src/connectors/src/data/io/csv/CSVDataReader.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/connectors/src/data/io/csv/CSVDataReader.java')
-rw-r--r--src/connectors/src/data/io/csv/CSVDataReader.java248
1 files changed, 248 insertions, 0 deletions
diff --git a/src/connectors/src/data/io/csv/CSVDataReader.java b/src/connectors/src/data/io/csv/CSVDataReader.java
new file mode 100644
index 0000000..6dbc8ff
--- /dev/null
+++ b/src/connectors/src/data/io/csv/CSVDataReader.java
@@ -0,0 +1,248 @@
+/*
+ * SSSync, a Simple and Stupid Synchronizer for data with multi-valued attributes
+ * Copyright (C) 2014 Ludovic Pouzenc <ludovic@pouzenc.fr>
+ *
+ * This file is part of SSSync.
+ *
+ * SSSync is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SSSync is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SSSync. If not, see <http://www.gnu.org/licenses/>
+ */
+package data.io.csv;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+
+import data.MVDataEntry;
+import data.io.AbstractMVDataReader;
+
+/**
+ * Stream-oriented reader from a particular CSV file.
+ * Always returns lines/items sorted by lexicographical ascending key.
+ *
+ * @author lpouzenc
+ */
+public class CSVDataReader extends AbstractMVDataReader {
+
+ public static final String CSV_DEMO =
+ //"key,attr,values\n" +
+ "line3,hello,all;the;others\n" +
+ "line1,from,csv1;csv1bis\n" +
+ "line2,hello,all;the;world\n" +
+ "line1,attr2,csv1\n" +
+ ",,\n";
+
+ public static final CSVFormat DEFAULT_CSV_FORMAT = CSVFormat.EXCEL
+ .withHeader("key","attr","values")
+ .withIgnoreSurroundingSpaces(true);
+
+ private final CSVFormat format;
+ private final Reader dataSourceStream;
+
+ private transient MVDataEntry nextEntry;
+ private transient CSVRecord nextCSVRecord;
+ private transient Iterator<CSVRecord> csvIt;
+
+
+ /**
+ * Constructs a CSVDataReader object for parsing a CSV input given via dataSourceStream.
+ * @param dataSourceName A short string representing this reader (for logging)
+ * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
+ * @param alreadySorted If false, memory cost is around 3 times the CSV file size !
+ * @param format Specify the exact format used to encode the CSV file (separators, escaping...)
+ * @throws IOException
+ */
+ public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted, CSVFormat format) throws IOException {
+ this.dataSourceName = dataSourceName;
+ this.format = format;
+
+ if ( alreadySorted ) {
+ this.dataSourceStream = dataSourceStream;
+ } else {
+ BufferedReader bufReader;
+ if ( dataSourceStream instanceof BufferedReader ) {
+ bufReader = (BufferedReader) dataSourceStream;
+ } else {
+ bufReader = new BufferedReader(dataSourceStream);
+ }
+ this.dataSourceStream = readAndSortLines(bufReader);
+ }
+ }
+
+ /**
+ * Constructs a CSVDataReader object with default CSV format (for CSVParser).
+ * @param dataSourceName A short string representing this reader (for logging)
+ * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
+ * @param alreadySorted If false, memory cost is around 3 times the CSV file size !
+ * @throws IOException
+ */
+ public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted) throws IOException {
+ this(dataSourceName, dataSourceStream, alreadySorted, DEFAULT_CSV_FORMAT);
+ }
+
+ /**
+ * {@inheritDoc}
+ * Note : multiple iterators on the same instance are not supported
+ */
+ @Override
+ public Iterator<MVDataEntry> iterator() {
+ // When a new iterator is requested, everything should be reset
+ CSVParser parser;
+ try {
+ dataSourceStream.reset();
+ parser = new CSVParser(dataSourceStream, format);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ csvIt = parser.iterator();
+ nextCSVRecord = null;
+ nextEntry = null;
+ return this;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean hasNext() {
+ if ( nextEntry == null ) {
+ lookAhead();
+ }
+ return ( nextEntry != null );
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public MVDataEntry next() {
+ if ( !hasNext() ) {
+ throw new NoSuchElementException();
+ }
+ // Pop the lookahead record
+ MVDataEntry res = nextEntry;
+ nextEntry=null;
+ // And return it
+ return res;
+ }
+
+ /**
+ * In-memory File sorting, return as a single String
+ * @param reader
+ * @return
+ * @throws IOException
+ */
+ private Reader readAndSortLines(BufferedReader bufReader) throws IOException {
+ // Put all the CSV in memory, in a SortedSet
+ SortedSet<String> lineSet = new TreeSet<String>();
+ String inputLine;
+ int totalCSVSize=0;
+ while ((inputLine = bufReader.readLine()) != null) {
+ lineSet.add(inputLine);
+ totalCSVSize += inputLine.length() + 1;
+ }
+ bufReader.close(); // Closes also dataSourceStream
+
+ // Put all sorted lines in a String
+ StringBuilder allLines = new StringBuilder(totalCSVSize);
+ for ( String line: lineSet) {
+ allLines.append(line + "\n");
+ }
+ lineSet = null; // Could help the GC if the input file is huge
+
+ // Build a Java Reader from that String
+ return new StringReader(allLines.toString());
+ }
+
+ /**
+ * A MVDataEntry could be represented on many CSV lines.
+ * The key is repeated, the attr could change, the values should change (for given key/attr pair)
+ */
+ private void lookAhead() {
+ MVDataEntry currEntry = null;
+
+ boolean abort=(nextCSVRecord==null && !csvIt.hasNext()); // Nothing to crunch
+ boolean done=(nextEntry!=null); // Already looked ahead
+ while (!abort && !done) {
+ // Try to get a valid CSVRecord
+ if ( nextCSVRecord == null ) {
+ nextCSVRecord = nextValidCSVRecord();
+ }
+ // If no more CSV data
+ if ( nextCSVRecord == null ) {
+ // Maybe we have a remaining entry to return
+ if ( currEntry != null ) {
+ done=true; continue;
+ } else {
+ abort=true; continue;
+ }
+ }
+
+ // Now we have a valid CSV line to put in a MVDataEntry
+ String newKey = nextCSVRecord.get("key");
+
+
+ // If no MVDataEntry yet, it's time to create it (we have data to put into)
+ if ( currEntry == null ) {
+ currEntry = new MVDataEntry(newKey);
+ }
+ // If CSV line key matches MVDataEntry key, appends attr/values on it
+ // XXX Tricky code : following condition is always true if the previous one is true
+ if ( currEntry.getKey().equals(newKey) ) {
+ currEntry.splitAndPut(nextCSVRecord.get("attr"), nextCSVRecord.get("values"), ";");
+ nextCSVRecord = null; // Record consumed
+ } else {
+ // Keys are different, we are done (and we have remaining CSV data in nextCSVRecord)
+ done=true; continue;
+ }
+ }
+
+ nextEntry = done?currEntry:null;
+ }
+
+ /**
+ * Seek for the next valid record in the CSV file
+ * @return the next valid CSVRecord
+ */
+ private CSVRecord nextValidCSVRecord() {
+ CSVRecord res = null;
+ boolean abort = !csvIt.hasNext();
+ boolean done = false;
+ while (!abort && !done) {
+ // Try to read a CSV line
+ res = (csvIt.hasNext())?csvIt.next():null;
+
+ // Break if nothing readable
+ if ( res == null ) {
+ abort=true; continue;
+ }
+
+ // Skip invalid and empty lines
+ String key = res.get("key");
+ if ( key != null && ! key.isEmpty() ) {
+ done=true; continue;
+ }
+ }
+
+ return done?res:null;
+ }
+}