001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.io.dataformat.ColumnDescription;
015    import org.maltparser.core.io.dataformat.DataFormatException;
016    import org.maltparser.core.io.dataformat.DataFormatInstance;
017    import org.maltparser.core.syntaxgraph.DependencyStructure;
018    import org.maltparser.core.syntaxgraph.Element;
019    import org.maltparser.core.syntaxgraph.TokenStructure;
020    import org.maltparser.core.syntaxgraph.edge.Edge;
021    /**
022    *
023    *
024    * @author Johan Hall
025    */
026    public class TabReader implements SyntaxGraphReader {
027            private BufferedReader reader;
028            private int sentenceCount;
029            private final StringBuilder input;
030            private DataFormatInstance dataFormatInstance;
031            private static final String IGNORE_COLUMN_SIGN = "_";
032            private static final char TAB = '\t';
033            private static final char NEWLINE = '\n';
034            private static final char CARRIAGE_RETURN = '\r';
035            private String fileName = null;
036            private URL url = null;
037            private String charsetName;
038            private int nIterations;
039            private int cIterations;
040            private boolean closeStream = true;
041            
042            public TabReader() { 
043                    input = new StringBuilder();
044                    nIterations = 1;
045                    cIterations = 1;
046            }
047            
048            private void reopen() throws MaltChainedException {
049                    close();
050                    if (fileName != null) {
051                            open(fileName, charsetName);
052                    } else if (url != null) {
053                            open(url, charsetName);
054                    } else {
055                            throw new DataFormatException("The input stream cannot be reopen. ");
056                    }
057            }
058            
059            public void open(String fileName, String charsetName) throws MaltChainedException {
060                    setFileName(fileName);
061                    setCharsetName(charsetName);
062                    try {
063                            open(new FileInputStream(fileName), charsetName);
064                    } catch (FileNotFoundException e) {
065                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
066                    }
067            }
068            
069            public void open(URL url, String charsetName) throws MaltChainedException {
070                    setUrl(url);
071                    setCharsetName(charsetName);
072                    if (url == null) {
073                            throw new DataFormatException("The input file cannot be found. ");
074                    }
075                    try {
076                            open(url.openStream(), charsetName);
077                    } catch (IOException e) {
078                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
079                    }
080            }
081            
082            public void open(InputStream is, String charsetName) throws MaltChainedException {
083                    try {
084                            if (is == System.in) {
085                                    closeStream = false;
086                            }
087                            open(new InputStreamReader(is, charsetName));
088                    } catch (UnsupportedEncodingException e) {
089                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
090                    }
091            }
092            
093            private void open(InputStreamReader isr) throws MaltChainedException {
094                    setReader(new BufferedReader(isr));
095                    setSentenceCount(0);
096            }
097            
098            public void readProlog() throws MaltChainedException {
099                    
100            }
101            
102            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
103                    if (syntaxGraph == null || dataFormatInstance == null) {
104                            return false;
105                    }
106                    
107                    Element node = null;
108                    Edge edge = null;
109                    input.setLength(0);
110                    int i = 0;
111                    int terminalCounter = 0;
112                    int nNewLines = 0;
113                    syntaxGraph.clear();
114                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
115                    while (true) {
116                            int c;
117    
118                            try {
119                                    c = reader.read();
120                            } catch (IOException e) {
121                                    close();
122                                    throw new DataFormatException("Error when reading from the input file. ", e);
123                            }
124                            if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
125                                    if (input.length() != 0) {                                      
126                                            if (i == 0) {
127                                                    terminalCounter++;
128                                                    node = syntaxGraph.addTokenNode(terminalCounter);
129                                            }
130                                            ColumnDescription column = null;
131                                            if (columns.hasNext()) {
132                                                    column = columns.next();
133                                                    if (column.getCategory() == ColumnDescription.INPUT && node != null) {
134                                                            syntaxGraph.addLabel(node, column.getName(), input.toString());
135                                                    } else if (column.getCategory() == ColumnDescription.HEAD) {
136                                                            if (syntaxGraph instanceof DependencyStructure) {
137                                                                    if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
138                                                                    //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
139                                                                            edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
140                                                                    }
141                                                            } 
142                                                            else {
143                                                                    close();
144                                                                    throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
145                                                            }
146                                                    } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
147                                                            //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
148                                                                    syntaxGraph.addLabel(edge, column.getName(), input.toString());
149                                                            //} // bugfix
150                                                    }
151                                            }
152                                            input.setLength(0);
153                                            nNewLines = 0;
154                                            i++;
155                                    } else if (c == TAB) {
156                                            throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
157                                    }
158                                    if (c == NEWLINE) {
159                                            nNewLines++;
160                                            i = 0;
161                                            columns = dataFormatInstance.iterator();
162                                    }
163                            } else {
164                                    input.append((char)c);
165                            }
166                            
167                            if (nNewLines == 2 && c == NEWLINE) {
168                                    if (syntaxGraph.hasTokens()) {
169                                            sentenceCount++;
170                                    }
171                                    return true;
172                            } else if (c == -1) {
173                                    if (syntaxGraph.hasTokens()) {
174                                            sentenceCount++;
175                                    }
176                                    if (cIterations < nIterations) {
177                                            cIterations++;
178                                            reopen();
179                                            return true;
180                                    }
181                                    
182                                    return false;                                   
183                            }
184                    }
185            }
186            
187            public void readEpilog() throws MaltChainedException {
188                    
189            }
190            
191            public BufferedReader getReader() {
192                    return reader;
193            }
194    
195            public void setReader(BufferedReader reader) throws MaltChainedException {
196                    close();
197                    this.reader = reader;
198            }
199            
200            public DataFormatInstance getDataFormatInstance() {
201                    return dataFormatInstance;
202            }
203    
204            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
205                    this.dataFormatInstance = dataFormatInstance;
206            }
207    
208            public int getSentenceCount() throws MaltChainedException {
209                    return sentenceCount;
210            }
211            
212            public void setSentenceCount(int sentenceCount) {
213                    this.sentenceCount = sentenceCount;
214            }
215            
216            public String getOptions() {
217                    return null;
218            }
219            
220            public void setOptions(String optionString) throws MaltChainedException {
221                    
222            }
223            
224            public String getFileName() {
225                    return fileName;
226            }
227    
228            public void setFileName(String fileName) {
229                    this.fileName = fileName;
230            }
231    
232            public URL getUrl() {
233                    return url;
234            }
235    
236            public void setUrl(URL url) {
237                    this.url = url;
238            }
239    
240            public String getCharsetName() {
241                    return charsetName;
242            }
243    
244            public void setCharsetName(String charsetName) {
245                    this.charsetName = charsetName;
246            }
247    
248            public int getNIterations() {
249                    return nIterations;
250            }
251    
252            public void setNIterations(int iterations) {
253                    nIterations = iterations;
254            }
255    
256            public int getIterationCounter() {
257                    return cIterations;
258            }
259    
260            public void close() throws MaltChainedException {
261                    try {
262                            if (reader != null) {
263                                    if (closeStream) {
264                                            reader.close();
265                                    }
266                                    reader = null;
267                            }
268                    } catch (IOException e) {
269                            throw new DataFormatException("Error when closing the input file. ", e);
270                    } 
271            }
272            
273            public void clear() throws MaltChainedException {
274                    close();
275                    input.setLength(0);
276                    dataFormatInstance = null;
277                    sentenceCount = 0;
278            }
279    }