001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    import org.maltparser.core.io.dataformat.ColumnDescription;
016    import org.maltparser.core.io.dataformat.DataFormatException;
017    import org.maltparser.core.io.dataformat.DataFormatInstance;
018    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
019    import org.maltparser.core.syntaxgraph.PhraseStructure;
020    import org.maltparser.core.syntaxgraph.TokenStructure;
021    import org.maltparser.core.syntaxgraph.edge.Edge;
022    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023    import org.maltparser.core.syntaxgraph.node.TokenNode;
024    /**
025    *
026    *
027    * @author Johan Hall
028    */
029    public class BracketReader implements SyntaxGraphReader {
030            private BufferedReader reader;
031            private DataFormatInstance dataFormatInstance;
032            private int sentenceCount;
033            private StringBuilder input;
034            private int terminalCounter; 
035            private int nonTerminalCounter;
036            private String optionString;
037            private SortedMap<String,ColumnDescription> inputColumns;
038            private SortedMap<String,ColumnDescription> edgeLabelColumns;
039            private SortedMap<String,ColumnDescription> phraseLabelColumns;
040            
041            private String fileName = null;
042            private URL url = null;
043            private String charsetName;
044            private int nIterations;
045            private int cIterations;
046            private boolean closeStream = true;
047            
048            private char STARTING_BRACKET = '(';
049            private char CLOSING_BRACKET = ')';
050            private char INPUT_SEPARATOR = ' ';
051            private char EDGELABEL_SEPARATOR = '-';
052            private char SENTENCE_SEPARATOR = '\n';
053            private char BLANK = ' ';
054            private char CARRIAGE_RETURN = '\r';
055            private char TAB = '\t';
056            
057            public BracketReader() { 
058                    input = new StringBuilder();
059                    nIterations = 1;
060                    cIterations = 1;
061            }
062            
063            private void reopen() throws MaltChainedException {
064                    close();
065                    if (fileName != null) {
066                            open(fileName, charsetName);
067                    } else if (url != null) {
068                            open(url, charsetName);
069                    } else {
070                            throw new DataFormatException("The input stream cannot be reopen. ");
071                    }
072            }
073            
074            public void open(String fileName, String charsetName) throws MaltChainedException {
075                    setFileName(fileName);
076                    setCharsetName(charsetName);
077                    try {
078                            open(new FileInputStream(fileName), charsetName);
079                    }catch (FileNotFoundException e) {
080                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081                    }
082            }
083            public void open(URL url, String charsetName) throws MaltChainedException {
084                    setUrl(url);
085                    setCharsetName(charsetName);
086                    try {
087                            open(url.openStream(), charsetName);
088                    } catch (IOException e) {
089                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090                    }
091            }
092            
093            public void open(InputStream is, String charsetName) throws MaltChainedException {
094                    try {
095                            if (is == System.in) {
096                                    closeStream = false;
097                            }
098                            open(new InputStreamReader(is, charsetName));
099                    } catch (UnsupportedEncodingException e) {
100                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101                    }
102            }
103            
104            private void open(InputStreamReader isr) throws MaltChainedException {
105                    setReader(new BufferedReader(isr));
106                    setSentenceCount(0);
107            }
108            
109            public void readProlog() throws MaltChainedException {
110                    
111            }
112            
113            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
114                    if (syntaxGraph == null || dataFormatInstance == null) {
115                            return false;
116                    }
117                    syntaxGraph.clear();
118                    int brackets = 0;
119                    try {
120                            int l = reader.read();
121                            char c;
122                            input.setLength(0);
123                    
124                            while (true) {
125                                    if (l == -1) {
126                                            input.setLength(0);
127                                            return false;
128                                    }
129                                    
130                                    c = (char)l; 
131                                    l = reader.read();
132    
133                                    if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
134    
135                                    } else if (c == STARTING_BRACKET) {
136                                            input.append(c);
137                                            brackets++;
138                                    } else if (c == CLOSING_BRACKET) {
139                                            input.append(c);
140                                            brackets--;
141                                    } else if (c == INPUT_SEPARATOR) {
142                                            if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
143                                                    input.append(c);
144                                            }
145                                    // Start BracketProgLangReader
146                                    } else if (c == '\\') {
147                                            c = (char) l;
148                                            l = reader.read();
149                                            if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') {
150                                                    System.out.println("Error");
151                                                    System.exit(1);
152                                            } else {
153                                                    input.append("\\" + c);
154                                            }
155                                    // End BracketProgLangReader
156                                    } else if (brackets != 0){
157                                            input.append(c);
158                                    }
159                                    if (brackets == 0 && input.length() != 0) {
160                                            sentenceCount++;
161                                            terminalCounter = 1; 
162                                            nonTerminalCounter = 1;
163                                            if (syntaxGraph instanceof PhraseStructure) {
164                                                    bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
165                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
166                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
167                                                    }
168                                            }
169                                            return true;
170                                    }
171                                    
172                                    if (c == -1) {
173                                            if (brackets != 0) {
174                                                    close();
175                                                    throw new MaltChainedException("Error when reading from the input file. ");
176                                            }
177                                            if (cIterations < nIterations) {
178                                                    cIterations++;
179                                                    reopen();
180                                                    return true;
181                                            }
182                                            return false;
183                                    }
184                            }
185                    }  catch (IOException e) {
186                            close();
187                            throw new MaltChainedException("Error when reading from the input file. ", e);
188                    } 
189                    
190            }
191                    
192            private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
193                    int bracketsdepth = 0;
194                    int startpos = start-1;
195                    for (int i = start, n = end; i < n; i++) {
196                            if (input.charAt(i) == STARTING_BRACKET
197                                            // Start BracketProgLangReader
198                                            && (i == 0 || input.charAt(i - 1) != '\\')
199                                            // end BracketProgLangReader
200                            
201                            ) {
202                                    if (bracketsdepth == 0) {
203                                            startpos = i;
204                                    }
205                                    bracketsdepth++;
206                            } else if (input.charAt(i) == CLOSING_BRACKET
207                                            // Start BracketProgLangReader
208                                            && (i == 0 || input.charAt(i - 1) != '\\')
209                                            // end BracketProgLangReader
210                            ) {
211                                    bracketsdepth--;
212                                    if (bracketsdepth == 0) {
213                                            extract(phraseStructure, startpos+1, i, parent);
214                                    }       
215                            }
216                    }
217            }
218    
219            private void extract(PhraseStructure phraseStructure, int begin, int end,  PhraseStructureNode parent) throws MaltChainedException {
220                    int index = -1;
221                    for (int i = begin; i < end; i++) {
222                            if (input.charAt(i) == STARTING_BRACKET
223                                            // Start BracketProgLangReader
224                                            && (i == begin || input.charAt(i - 1) != '\\')
225                                            // end BracketProgLangReader            
226                            ) {
227                                    index = i;
228                                    break;
229                            }
230                    }
231                    if (index == -1) {
232                            TokenNode t = phraseStructure.addTokenNode(terminalCounter);
233                            if (t == null) {
234                                    close();
235                                    throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
236                            }
237    
238                            terminalCounter++;
239                            Edge e = null;
240    
241                            if (parent != null) {
242                                    e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
243                            } else {
244                                    close();
245                                    throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
246                            }
247    
248                            int start = begin;
249    
250                            Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
251                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
252                            boolean noneNode = false;
253                            boolean edgeLabels = false;
254                            for (int i = begin; i < end; i++) {
255                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR 
256                                                    // Start BracketProgLangReader
257                                                    && (i == begin || input.charAt(i - 1) != '\\')
258                                                    // end BracketProgLangReader    
259                                            ) || i == end - 1) {
260                                            if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
261                                                    noneNode = true;
262                                            } else if (start == begin) {
263                                                    if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
264                                                            if (inputColumnsIterator.hasNext()) { 
265                                                                    t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), 
266                                                                                    
267                                                                                    // Start BracketProgLangReader
268                                                                                    decodeString(
269                                                                                    // end BracketProgLangReader
270                                                                                    (i == end - 1)?input.substring(start,end):input.substring(start, i)
271                                                                                    // Start BracketProgLangReader
272                                                                                    )
273                                                                                    // end BracketProgLangReader            
274                                                                                    );
275                                                            }
276                                                            start = i + 1;
277                                                            if (input.charAt(i) == EDGELABEL_SEPARATOR) {
278                                                                    edgeLabels = true;
279                                                            }
280                                                    }
281                                            } else if (edgeLabels && e != null) {
282                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
283                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
284                                                    }
285                                                    start = i + 1;
286                                                    if (input.charAt(i) == INPUT_SEPARATOR
287                                                                    // Start BracketProgLangReader
288                                                                    && (i == begin || input.charAt(i - 1) != '\\')
289                                                                    // end BracketProgLangReader            
290                                                    ) {
291                                                            edgeLabels = false;
292                                                    }
293                                            } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
294                                                            // Start BracketProgLangReader
295                                                            && (i == begin || input.charAt(i - 1) != '\\')
296                                                            // end BracketProgLangReader
297                                                            )
298                                            ) {     
299                                            } else {
300                                                    if (inputColumnsIterator.hasNext()) { 
301                                                            t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
302                                                    }
303                                                    start = i + 1;
304                                            }
305                                    }
306                            }
307                    } else {
308                            PhraseStructureNode nt;
309                            Edge e = null;
310                            if (parent == null) {
311                                    nt = phraseStructure.getPhraseStructureRoot();
312                            } else {
313                                    nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
314                                    if (nt == null) {
315                                            close();
316                                            throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
317                                    } 
318                                    nonTerminalCounter++;
319    
320                                    e = phraseStructure.addPhraseStructureEdge(parent, nt);
321                            }
322                            Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
323                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
324                            int newbegin = begin;
325                            int start = begin;
326                            
327                            for (int i = begin; i < index; i++) {
328                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
329                                            if (start == newbegin) {
330                                                    if (phraseLabelColumnsIterator.hasNext()) { 
331                                                            nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
332                                                    }
333                                                    start = i + 1;
334                                            } else if (e != null) {
335                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
336                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
337                                                    }
338                                                    start = i + 1;
339                                            }
340                                    } else if (input.charAt(i) == BLANK) {
341                                            start++;
342                                            newbegin++;
343                                    }
344                            }
345    
346                            bracketing(phraseStructure, index, end, nt);
347                    }
348            }
349            
350            private String decodeString(String string) {
351                    return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
352            }
353            
354            public void readEpilog() throws MaltChainedException {
355                    
356            }
357            
358            public BufferedReader getReader() {
359                    return reader;
360            }
361    
362            public void setReader(BufferedReader reader) {
363                    this.reader = reader;
364            }
365            
366            public int getSentenceCount() throws MaltChainedException {
367                    return sentenceCount;
368            }
369            
370            public void setSentenceCount(int sentenceCount) {
371                    this.sentenceCount = sentenceCount;
372            }
373            
374            public DataFormatInstance getDataFormatInstance() {
375                    return dataFormatInstance;
376            }
377            
378            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
379                    this.dataFormatInstance = inputDataFormatInstance;
380                    inputColumns = dataFormatInstance.getInputColumnDescriptions();
381                    edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
382                    phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
383            }
384            
385            public String getOptions() {
386                    return optionString;
387            }
388            
389            public void setOptions(String optionString) throws MaltChainedException {
390                    this.optionString = optionString;
391            }
392            
393            public String getFileName() {
394                    return fileName;
395            }
396    
397            public void setFileName(String fileName) {
398                    this.fileName = fileName;
399            }
400    
401            public URL getUrl() {
402                    return url;
403            }
404    
405            public void setUrl(URL url) {
406                    this.url = url;
407            }
408    
409            public String getCharsetName() {
410                    return charsetName;
411            }
412    
413            public void setCharsetName(String charsetName) {
414                    this.charsetName = charsetName;
415            }
416    
417            public int getNIterations() {
418                    return nIterations;
419            }
420    
421            public void setNIterations(int iterations) {
422                    nIterations = iterations;
423            }
424    
425            public int getIterationCounter() {
426                    return cIterations;
427            }
428            
429            public void close() throws MaltChainedException {
430                    try {
431                            if (reader != null) {
432                                    if (closeStream) {
433                                            reader.close();
434                                    }
435                                    reader = null;
436                            }
437                    }   catch (IOException e) {
438                            throw new DataFormatException("Error when closing the input file.", e);
439                    } 
440            }
441    }