001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
021    import org.maltparser.core.syntaxgraph.PhraseStructure;
022    import org.maltparser.core.syntaxgraph.TokenStructure;
023    import org.maltparser.core.syntaxgraph.edge.Edge;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class NegraReader implements SyntaxGraphReader {
032            private enum NegraTables {
033                    ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
034            };
035            private BufferedReader reader;
036            private DataFormatInstance dataFormatInstance;
037            private int sentenceCount;
038            private String optionString;
039            private int formatVersion;
040            private NegraTables currentHeaderTable;
041            private int currentTerminalSize;
042            private int currentNonTerminalSize;
043            private SortedMap<Integer,PhraseStructureNode> nonterminals; 
044            private StringBuilder edgelabelSymbol;
045            private StringBuilder edgelabelTableName;
046            private int START_ID_OF_NONTERMINALS = 500;
047            private String fileName = null;
048            private URL url = null;
049            private String charsetName;
050            private int nIterations;
051            private int cIterations;
052            private boolean closeStream = true;
053            
054            public NegraReader() {
055                    currentHeaderTable = NegraTables.UNDEF;
056                    edgelabelSymbol = new StringBuilder();
057                    edgelabelTableName = new StringBuilder();
058                    nonterminals = new TreeMap<Integer,PhraseStructureNode>();
059                    nIterations = 1;
060                    cIterations = 1;
061            }
062            
063            private void reopen() throws MaltChainedException {
064                    close();
065                    if (fileName != null) {
066                            open(fileName, charsetName);
067                    } else if (url != null) {
068                            open(url, charsetName);
069                    } else {
070                            throw new DataFormatException("The input stream cannot be reopen. ");
071                    }
072            }
073            
074            public void open(String fileName, String charsetName) throws MaltChainedException {
075                    setFileName(fileName);
076                    setCharsetName(charsetName);
077                    try {
078                            open(new FileInputStream(fileName), charsetName);
079                    } catch (FileNotFoundException e) {
080                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081                    }
082            }
083            public void open(URL url, String charsetName) throws MaltChainedException {
084                    setUrl(url);
085                    setCharsetName(charsetName);
086                    try {
087                            open(url.openStream(), charsetName);
088                    } catch (IOException e) {
089                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090                    }
091            }
092            
093            public void open(InputStream is, String charsetName) throws MaltChainedException {
094                    try {
095                            if (is == System.in) {
096                                    closeStream = false;
097                            }
098                            open(new InputStreamReader(is, charsetName));
099                    } catch (UnsupportedEncodingException e) {
100                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101                    }
102            }
103            
104            private void open(InputStreamReader isr) throws MaltChainedException {
105                    setReader(new BufferedReader(isr));
106                    setSentenceCount(0);
107            }
108            
109            public void readProlog() throws MaltChainedException {
110                    
111            }
112            
113            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
114                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
115                            return false;
116                    }
117                    syntaxGraph.clear();
118                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
119                    PhraseStructureNode parent = null;
120                    PhraseStructureNode child = null;
121                    currentHeaderTable = NegraTables.UNDEF;
122                    String line = null;
123                    syntaxGraph.clear();
124                    nonterminals.clear();
125                    try {
126                            while (true) {
127                                    line = reader.readLine();
128                                    if (line == null) {
129                                            if (syntaxGraph.hasTokens()) {
130                                                    sentenceCount++;
131                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
132                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
133                                                    }
134                                            }
135                                            if (cIterations < nIterations) {
136                                                    cIterations++;
137                                                    reopen();
138                                                    return true;
139                                            }
140                                            return false;
141                                    } else if (line.startsWith("#EOS")) {
142                                            currentTerminalSize = 0;
143                                            currentNonTerminalSize = 0;
144                                            currentHeaderTable = NegraTables.UNDEF;
145                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
146                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
147                                            }
148                                            return true;
149                                    } else if (line.startsWith("#BOS")) {
150                                            currentHeaderTable = NegraTables.SENTENCE;
151                                            int s = -1, e = -1;
152                                            for (int i = 5, n = line.length(); i < n; i++) {
153                                                    if (Character.isDigit(line.charAt(i)) && s == -1) {
154                                                            s = i;
155                                                    }
156                                                    if (line.charAt(i) == ' ') {
157                                                            e = i;
158                                                            break;
159                                                    }
160                                            }
161                                            if (s != e && s != -1 && e != -1) {
162                                                    phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
163                                            }
164                                            sentenceCount++;
165                                    } else if (currentHeaderTable == NegraTables.SENTENCE) {
166                                            if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
167                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
168                                                    ColumnDescription column = null;
169                                                    currentNonTerminalSize++;
170                                                    char[] lineChars = line.toCharArray();
171                                                    int start = 0;
172                                                    int secedgecounter = 0;
173                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
174                                                            if (lineChars[i] == '\t' && start == i) {
175                                                                    start++;
176                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
177                                                                    if (columns.hasNext()) {
178                                                                            column = columns.next();
179                                                                    }
180                                                                    if (column.getPosition() == 0) {
181                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
182                                                                            child = nonterminals.get(index);
183                                                                            if (child == null) {
184                                                                                    if (index != 0) {
185                                                                                            child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
186                                                                                    }
187                                                                                    nonterminals.put(index,child);
188                                                                            }
189                                                                    } else if (column.getPosition() == 2 && child != null) {
190                                                                            syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
191                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 
192                                                                            edgelabelSymbol.setLength(0);
193                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
194                                                                            edgelabelTableName.setLength(0);
195                                                                            edgelabelTableName.append(column.getName());
196                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
197                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
198                                                                            parent = nonterminals.get(index);
199                                                                            if (parent == null) {
200                                                                                    if (index == 0) {
201                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
202                                                                                    } else {
203                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
204                                                                                    }
205                                                                                    nonterminals.put(index,parent);
206                                                                            }
207                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
208                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
209                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
210                                                                            if (secedgecounter % 2 == 0) {
211                                                                                    edgelabelSymbol.setLength(0);
212                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
213                                                                                    secedgecounter++;
214                                                                            } else {
215                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
216                                                                                    if (index == 0) {
217                                                                                            parent = phraseStructure.getPhraseStructureRoot();
218                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
219                                                                                            parent = phraseStructure.getTokenNode(index);
220                                                                                    } else {
221                                                                                            parent = nonterminals.get(index);
222                                                                                            if (parent == null) {
223                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
224                                                                                                    nonterminals.put(index,parent);
225                                                                                            }
226                                                                                    }
227                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
228                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
229                                                                                    secedgecounter++;
230                                                                            }
231                                                                    }
232                                                                    start = i + 1;
233                                                            }
234                                                    }
235                                            } else { // Terminal
236                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
237                                                    ColumnDescription column = null;
238                                                    
239                                                    currentTerminalSize++;
240                                                    child = syntaxGraph.addTokenNode(currentTerminalSize);
241                                                    char[] lineChars = line.toCharArray();
242                                                    int start = 0;
243                                                    int secedgecounter = 0;
244                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
245                                                            if (lineChars[i] == '\t' && start == i) {
246                                                                    start++;
247                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
248                                                                    if (columns.hasNext()) {
249                                                                            column = columns.next();
250                                                                    }
251                                                                    if (column.getCategory() == ColumnDescription.INPUT && child != null) {
252                                                                            syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
253                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
254                                                                            edgelabelSymbol.setLength(0);
255                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
256                                                                            edgelabelTableName.setLength(0);
257                                                                            edgelabelTableName.append(column.getName());
258                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
259                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
260                                                                            parent = nonterminals.get(index);
261                                                                            if (parent == null) {
262                                                                                    if (index == 0) {
263                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
264                                                                                    } else {
265                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
266                                                                                    }
267                                                                                    nonterminals.put(index,parent);
268                                                                            }
269    
270                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
271                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
272                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
273                                                                            if (secedgecounter % 2 == 0) {
274                                                                                    edgelabelSymbol.setLength(0);
275                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
276                                                                                    secedgecounter++;
277                                                                            } else {
278                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
279                                                                                    if (index == 0) {
280                                                                                            parent = phraseStructure.getPhraseStructureRoot();
281                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
282                                                                                            parent = phraseStructure.getTokenNode(index);
283                                                                                    } else {
284                                                                                            parent = nonterminals.get(index);
285                                                                                            if (parent == null) {
286                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
287                                                                                                    nonterminals.put(index,parent);
288                                                                                            }
289                                                                                    }
290                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
291                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
292                                                                                    secedgecounter++;
293                                                                            }
294                                                                    }
295                                                                    start = i + 1;
296                                                            }
297                                                    }
298                                            }
299                                    } else if (line.startsWith("%%")) { // comment skip
300                                    
301                                    } else if (line.startsWith("#FORMAT")) {
302    //                              int index = line.indexOf(' ');
303    //                              if (index > -1) {
304    //                                      try {
305    //                                              formatVersion = Integer.parseInt(line.substring(index+1));
306    //                                      } catch (NumberFormatException e) {
307    //                                              
308    //                                      }
309    //                              }
310                                    } else if (line.startsWith("#BOT")) {
311    //                              int index = line.indexOf(' ');
312    //                              if (index > -1) {
313    //                                      if (line.substring(index+1).equals("ORIGIN")) {
314    //                                              currentHeaderTable = NegraTables.ORIGIN;
315    //                                      } else if (line.substring(index+1).equals("EDITOR")) {
316    //                                              currentHeaderTable = NegraTables.EDITOR;
317    //                                      } else if (line.substring(index+1).equals("WORDTAG")) {
318    //                                              currentHeaderTable = NegraTables.WORDTAG;
319    //                                      } else if (line.substring(index+1).equals("MORPHTAG")) {
320    //                                              currentHeaderTable = NegraTables.MORPHTAG;
321    //                                      } else if (line.substring(index+1).equals("NODETAG")) {
322    //                                              currentHeaderTable = NegraTables.NODETAG;
323    //                                      } else if (line.substring(index+1).equals("EDGETAG")) {
324    //                                              currentHeaderTable = NegraTables.EDGETAG;
325    //                                      } else if (line.substring(index+1).equals("SECEDGETAG")) {
326    //                                              currentHeaderTable = NegraTables.SECEDGETAG;
327    //                                      } else {
328    //                                              currentHeaderTable = NegraTables.UNDEF;
329    //                                      }
330    //                              }
331                                    } else if (line.startsWith("#EOT")) {
332                                            currentHeaderTable = NegraTables.UNDEF;
333                                    }
334                            }
335                    }  catch (IOException e) {
336                            throw new DataFormatException("Error when reading from the input file. ", e);
337                    }
338            }
339            
340            public void readEpilog() throws MaltChainedException {
341                    
342            }
343            
344            public BufferedReader getReader() {
345                    return reader;
346            }
347    
348            public void setReader(BufferedReader reader) {
349                    this.reader = reader;
350            }
351    
352            public int getSentenceCount() {
353                    return sentenceCount;
354            }
355    
356            public void setSentenceCount(int sentenceCount) {
357                    this.sentenceCount = sentenceCount;
358            }
359            
360            public int getFormatVersion() {
361                    return formatVersion;
362            }
363    
364            public void setFormatVersion(int formatVersion) {
365                    this.formatVersion = formatVersion;
366            }
367    
368            public DataFormatInstance getDataFormatInstance() {
369                    return dataFormatInstance;
370            }
371            
372            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
373                    this.dataFormatInstance = inputDataFormatInstance;
374            }
375            
376            public String getOptions() {
377                    return optionString;
378            }
379            
380            public void setOptions(String optionString) throws MaltChainedException {
381                    this.optionString = optionString;
382    
383                    String[] argv;
384                    try {
385                            argv = optionString.split("[_\\p{Blank}]");
386                    } catch (PatternSyntaxException e) {
387                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
388                    }
389                    for (int i=0; i < argv.length-1; i++) {
390                            if(argv[i].charAt(0) != '-') {
391                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
392                            }
393                            if(++i>=argv.length) {
394                                    throw new DataFormatException("The last argument does not have any value. ");
395                            }
396                            switch(argv[i-1].charAt(1)) {
397                            case 's': 
398                                    try {
399                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
400                                    } catch (NumberFormatException e){
401                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
402                                    }
403                                    break;
404                            default:
405                                    throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
406                            }
407                    }
408            }
409            
410            public String getFileName() {
411                    return fileName;
412            }
413    
414            public void setFileName(String fileName) {
415                    this.fileName = fileName;
416            }
417    
418            public URL getUrl() {
419                    return url;
420            }
421    
422            public void setUrl(URL url) {
423                    this.url = url;
424            }
425    
426            public String getCharsetName() {
427                    return charsetName;
428            }
429    
430            public void setCharsetName(String charsetName) {
431                    this.charsetName = charsetName;
432            }
433    
434            public int getNIterations() {
435                    return nIterations;
436            }
437    
438            public void setNIterations(int iterations) {
439                    nIterations = iterations;
440            }
441    
442            public int getIterationCounter() {
443                    return cIterations;
444            }
445            
446            public void close() throws MaltChainedException {
447                    try {
448                            if (reader != null) {
449                                    if (closeStream) {
450                                            reader.close();
451                                    }
452                                    reader = null;
453                            }
454                    }   catch (IOException e) {
455                            throw new DataFormatException("Error when closing the input file.", e);
456                    } 
457            }
458    }