001    package org.maltparser.core.syntaxgraph.reader;
002    
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.regex.PatternSyntaxException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.symbol.SymbolTable;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.SyntaxGraphException;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030    
031    /**
032    *
033    *
034    * @author Johan Hall
035    */
036    public class TigerXMLReader implements SyntaxGraphReader {
037    //      private TigerXMLHeader header;
038            private XMLStreamReader reader;
039            private int sentenceCount;
040            private DataFormatInstance dataFormatInstance;
041            private StringBuffer ntid;
042            private final StringBuilder graphRootID;
043    //      private StringBuilder elementContent; 
044    //      private StringBuilder valueName;
045    //      private StringBuilder currentFeatureName;
046    //      private Domain domain;
047    //      private boolean collectChar = false;
048            private String optionString;
049            private String fileName = null;
050            private URL url = null;
051            private String charsetName;
052            private int nIterations;
053            private int cIterations;
054            private int START_ID_OF_NONTERMINALS = 500;
055            private boolean closeStream = true;
056            
057            public TigerXMLReader() {
058                    this.ntid = new StringBuffer();
059    //              elementContent = new StringBuilder();
060    //              valueName = new StringBuilder();
061    //              currentFeatureName = new StringBuilder(); 
062                    graphRootID = new StringBuilder(); 
063                    nIterations = 1;
064                    cIterations = 1;
065            }
066            
067            private void reopen() throws MaltChainedException {
068                    close();
069                    if (fileName != null) {
070                            open(fileName, charsetName);
071                    } else if (url != null) {
072                            open(url, charsetName);
073                    } else {
074                            throw new DataFormatException("The input stream cannot be reopen. ");
075                    }
076            }
077            
078            public void open(String fileName, String charsetName) throws MaltChainedException {
079                    setFileName(fileName);
080                    setCharsetName(charsetName);
081                    try {
082                            open(new FileInputStream(fileName), charsetName);
083                    }catch (FileNotFoundException e) {
084                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
085                    }
086            }
087            public void open(URL url, String charsetName) throws MaltChainedException {
088                    setUrl(url);
089                    setCharsetName(charsetName);
090                    try {
091                            open(url.openStream(), charsetName);
092                    } catch (IOException e) {
093                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
094                    }
095            }
096            
097            public void open(InputStream is, String charsetName) throws MaltChainedException {
098                    try {
099                            if (is == System.in) {
100                                    closeStream = false;
101                            }
102                            open(new InputStreamReader(is, charsetName));
103                    } catch (UnsupportedEncodingException e) {
104                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
105                    }
106            }
107            
108            private void open(InputStreamReader isr) throws MaltChainedException {
109                    try {
110                            XMLInputFactory factory = XMLInputFactory.newInstance();
111                            setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
112                    } catch (XMLStreamException e) {
113                            throw new DataFormatException("XML input file could be opened. ", e);
114                    } 
115                    setSentenceCount(0);
116            }
117            
118            public void readProlog() throws MaltChainedException {
119                    
120            }
121            
122            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
123                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
124                            return false;
125                    }
126                    syntaxGraph.clear();
127                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
128                    PhraseStructureNode parent = null;
129                    PhraseStructureNode child = null;
130    //              if (header == null) {
131    //                      header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
132    //              }
133    
134                    try {
135                            while (true) {
136                                    int event = reader.next();
137                                    if (event == XMLStreamConstants.START_ELEMENT) {
138                                            if (reader.getLocalName().length() == 0) {
139                                                    continue;
140                                            }
141                                            if (reader.getLocalName().charAt(0) == 'e') {
142                                                    // e -> edge, edgelabel
143                                                    if (reader.getLocalName().length() == 4) { //edge
144                                                            int childid = -1;
145                                                            int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
146                                                            
147                                                            try {
148                                                                    if (indexSep != -1) {
149                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
150                                                                    } else {
151                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
152                                                                    }
153                                                                    if (childid == -1) {
154                                                                            throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
155                                                                    }
156                                                            } catch (NumberFormatException e) {
157                                                                    throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
158                                                            }
159    
160                                                            if (childid < START_ID_OF_NONTERMINALS) {
161                                                                    child = phraseStructure.getTokenNode(childid);
162                                                            } else {
163    
164                                                                    child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
165                                                            }
166    
167                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
168                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
169                                                            for (String name : inputTables.keySet()) {
170                                                                    e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
171                                                            }
172                                                    } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
173    //                                                      domain = Domain.EL;
174                                                    }
175                                            } else if (reader.getLocalName().charAt(0) == 'n') {
176                                                    // n -> nt, nonterminals, name
177                                                    if (reader.getLocalName().length() == 2) { // nt
178                                                            final String id = reader.getAttributeValue(null, "id");
179                                                            if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
180                                                                    parent = phraseStructure.getPhraseStructureRoot();
181                                                            } else {
182                                                                    int index = id.indexOf('_');
183                                                                    if (index != -1) {
184                                                                            parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
185                                                                    }
186                                                            }
187                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
188                                                            for (String name : inputTables.keySet()) {
189                                                                    parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
190                                                            }
191                                                    } else if (reader.getLocalName().equals("name")) { // name
192    //                                                      elementContent.setLength(0);
193    //                                                      collectChar = true;
194                                                    }
195                                            } else if (reader.getLocalName().charAt(0) == 't') {
196                                                    // t -> t, terminals
197                                                    if (reader.getLocalName().length() == 1) { // t
198                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
199                                                            child = syntaxGraph.addTokenNode();
200                                                            for (String name : inputTables.keySet()) {
201                                                                    child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
202                                                            }
203                                                    }
204                                            } else if (reader.getLocalName().charAt(0) == 's') {
205                                                    // s -> subcorpus, secedge, s, secedgelabel
206                                                    if (reader.getLocalName().length() == 1) { // s
207                                                            String id = reader.getAttributeValue(null, "id");
208                                                            boolean indexable = false;
209                                                            int index = -1;
210                                                            if (id != null && id.length() > 0) {
211                                                                    for (int i = 0, n = id.length(); i < n; i++) {
212                                                                            if (Character.isDigit(id.charAt(i))) {
213                                                                                    if (index == -1) { 
214                                                                                            index = i;
215                                                                                    }
216                                                                                    indexable = true;
217                                                                            }
218                                                                    }
219                                                            }
220                                                            if (indexable) {
221                                                                    phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
222                                                            } else {
223                                                                    phraseStructure.setSentenceID(sentenceCount+1);
224                                                            }
225                                                    }
226                                            } else if (reader.getLocalName().charAt(0) == 'v') {
227                                                    // v -> variable, value
228    //                                              if (reader.getLocalName().equals("value")) {
229    //                                                      valueName.setLength(0);
230    //                                                      valueName.append(reader.getAttributeValue(null, "name"));
231    //                                                      elementContent.setLength(0);
232    //                                                      collectChar = true;
233    //                                              }
234                                            } else {
235    //                                               a -> annotation, author
236    //                                               b -> body
237    //                                               c -> corpus
238    //                                               d -> date, description,
239    //                                               f -> feature, format
240    //                                               g -> graph
241    //                                               h -> head, history
242    //                                               m -> matches, match
243                                                    if (reader.getLocalName().equals("graph")) {
244                                                            graphRootID.setLength(0);
245                                                            graphRootID.append(reader.getAttributeValue(null, "root"));
246                                                    } else  if (reader.getLocalName().equals("corpus")) {
247    //                                                      header.setCorpusID(reader.getAttributeValue(null, "id"));
248    //                                                      header.setCorpusID(reader.getAttributeValue(null, "version"));
249                                                    } else if (reader.getLocalName().equals("feature")) {
250    //                                                      if (header != null) {
251    //                                                              currentFeatureName.setLength(0);
252    //                                                              currentFeatureName.append(reader.getAttributeValue(null, "name"));
253    //                                                              header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
254    //                                                      }
255    //                                                      domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
256                                                    } else if (reader.getLocalName().equals("secedgelabel")) {
257    //                                                      domain = Domain.SEL;
258                                                    } else if (reader.getLocalName().equals("author")) {
259    //                                                      elementContent.setLength(0);
260    //                                                      collectChar = true;
261                                                    } else if (reader.getLocalName().equals("date")) {
262    //                                                      elementContent.setLength(0);
263    //                                                      collectChar = true;
264                                                    } else if (reader.getLocalName().equals("description")) {
265    //                                                      elementContent.setLength(0);
266    //                                                      collectChar = true;
267                                                    } else if (reader.getLocalName().equals("format")) {
268    //                                                      elementContent.setLength(0);
269    //                                                      collectChar = true;
270                                                    } else if (reader.getLocalName().equals("history")) {
271    //                                                      elementContent.setLength(0);
272    //                                                      collectChar = true;
273                                                    } 
274                                            }
275                                    } else if (event == XMLStreamConstants.END_ELEMENT) {
276                                            if (reader.getLocalName().length() == 0) {
277                                                    continue;
278                                            }
279                                            if (reader.getLocalName().charAt(0) == 'e') {
280                                                    // e -> edge, edgelabel
281                                            } else if (reader.getLocalName().charAt(0) == 'n') {
282                                                    // n -> nt, nonterminals, name
283                                                    if (reader.getLocalName().equals("nt")) {
284                                                            ntid.setLength(0);
285                                                    }
286                                                    else if (reader.getLocalName().equals("nonterminals")) {
287                                                            if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
288                                                                    Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
289                                                                    SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
290                                                                    for (String name : inputTables.keySet()) {
291                                                                            e.addLabel(inputTables.get(name), "--");
292                                                                    }
293                                                            }
294                                                    }
295    //                                              else if (reader.getLocalName().equals("name")) {
296    //                                                      if (header != null) {
297    //                                                              header.setMetaName(elementContent.toString());
298    //                                                      }
299    //                                                      collectChar = false;
300    //                                              }
301                                            } else if (reader.getLocalName().charAt(0) == 't') {
302                                                    // t -> t, terminals
303                                            } else if (reader.getLocalName().charAt(0) == 's') {
304                                                    // s -> subcorpus, secedge, s, secedgelabel
305                                                    if (reader.getLocalName().equals("s")) {
306                                                            if (syntaxGraph.hasTokens()) {
307                                                                    sentenceCount++;
308                                                            }
309                                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
310                                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
311                                                            }
312                                                            return true;
313                                                    }
314                                            } else if (reader.getLocalName().charAt(0) == 'v') {
315                                                    // v -> variable, value
316    //                                              if (reader.getLocalName().equals("value")) {
317    //                                                      if (header != null) {
318    //                                                              if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
319    //                                                                      header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
320    //                                                              } else if (domain == Domain.EL) {
321    //                                                                      header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
322    //                                                              } else if (domain == Domain.SEL) {
323    //                                                                      header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
324    //                                                              }
325    //                                                      }
326    //                                                      collectChar = false;
327    //                                              }
328                                            } else {
329    //                                               a -> annotation, author
330    //                                               b -> body
331    //                                               c -> corpus
332    //                                               d -> date, description,
333    //                                               f -> feature, format
334    //                                               g -> graph
335    //                                               h -> head, history
336    //                                               m -> matches, match
337                                                    if (reader.getLocalName().equals("body")) {
338                                                            //sentence = dataStructures.getSentence();
339                                                            //phraseTree = dataStructures.getInPhraseTree();
340                                                            //sentence.clear();
341                                                            //phraseTree.clear();
342                                                            //dataStructures.setLastProcessObject(true);
343                                                    }  else if (reader.getLocalName().equals("author")) {
344    //                                                      if (header != null) {
345    //                                                              header.setMetaAuthor(elementContent.toString());
346    //                                                      }
347    //                                                      collectChar = false;
348                                                    } else if (reader.getLocalName().equals("date")) {
349    //                                                      if (header != null) {
350    //                                                              header.setMetaInDate(elementContent.toString());
351    //                                                      }
352    //                                                      collectChar = false;
353                                                    } else if (reader.getLocalName().equals("description")) {
354    //                                                      if (header != null) {
355    //                                                              header.setMetaDescription(elementContent.toString());
356    //                                                      }
357    //                                                      collectChar = false;
358                                                    } else if (reader.getLocalName().equals("format")) {
359    //                                                      if (header != null) {
360    //                                                              header.setMetaFormat(elementContent.toString());
361    //                                                      }
362    //                                                      collectChar = false;
363                                                    } else if (reader.getLocalName().equals("history")) {
364    //                                                      if (header != null) {
365    //                                                              header.setMetaHistory(elementContent.toString());
366    //                                                      }
367    //                                                      collectChar = false;
368                                                    } /* else if (reader.getLocalName().equals("annotation")) {
369                                                            if (header != null) {
370                                                                    System.out.println(header.toTigerXML());
371                                                            }
372                                                            collectChar = false;
373                                                    } */
374                                            }                               
375                                    } else if (event == XMLStreamConstants.END_DOCUMENT) {
376                                            if (syntaxGraph.hasTokens()) {
377                                                    sentenceCount++;
378                                            }
379                                            if (cIterations < nIterations) {
380                                                    cIterations++;
381                                                    reopen();
382                                                    return true;
383                                            }
384                                            return false;
385                                    } else if (event == XMLStreamConstants.CHARACTERS) {
386    //                                      if (collectChar) {
387    //                                              char[] ch = reader.getTextCharacters();
388    //                                              final int size = reader.getTextStart()+reader.getTextLength();
389    //                                              for (int i = reader.getTextStart(); i < size; i++) {
390    //                                                      elementContent.append(ch[i]);
391    //                                              }
392    //                                      }
393                                    }
394                            }
395                    } catch (XMLStreamException e) {
396                            throw new DataFormatException("", e);
397                    }
398            }
399            
400            public int getSentenceCount() {
401                    return sentenceCount;
402            }
403    
404            public void setSentenceCount(int sentenceCount) {
405                    this.sentenceCount = sentenceCount;
406            }
407            
408            public XMLStreamReader getReader() {
409                    return reader;
410            }
411    
412            public void setReader(XMLStreamReader reader) {
413                    this.reader = reader;
414            }
415            
416            public void readEpilog() throws MaltChainedException {
417                    
418            }
419            
420            public void close() throws MaltChainedException {
421                    try {
422                            if (reader != null) {
423                                    if (closeStream) {
424                                            reader.close();
425                                    }
426                                    reader = null;
427                            }
428                    } catch (XMLStreamException e) {
429                            throw new DataFormatException("The XML input file could be closed. ", e);
430                    }
431            }
432    
433            public DataFormatInstance getDataFormatInstance() {
434                    return dataFormatInstance;
435            }
436            
437            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
438                    this.dataFormatInstance = inputDataFormatInstance;
439            }
440            
441            public String getOptions() {
442                    return optionString;
443            }
444            
445            public void setOptions(String optionString) throws MaltChainedException {
446                    this.optionString = optionString;
447                    String[] argv;
448                    try {
449                            argv = optionString.split("[_\\p{Blank}]");
450                    } catch (PatternSyntaxException e) {
451                            throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
452                    }
453                    for (int i=0; i < argv.length-1; i++) {
454                            if(argv[i].charAt(0) != '-') {
455                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
456                            }
457                            if(++i>=argv.length) {
458                                    throw new DataFormatException("The last argument does not have any value. ");
459                            }
460                            switch(argv[i-1].charAt(1)) {
461                            case 's': 
462                                    try {
463                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
464                                    } catch (NumberFormatException e){
465                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
466                                    }
467                                    break;
468                            default:
469                                    throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");          
470                            }
471                    }
472            }
473            
474            public String getFileName() {
475                    return fileName;
476            }
477    
478            public void setFileName(String fileName) {
479                    this.fileName = fileName;
480            }
481    
482            public URL getUrl() {
483                    return url;
484            }
485    
486            public void setUrl(URL url) {
487                    this.url = url;
488            }
489    
490            public String getCharsetName() {
491                    return charsetName;
492            }
493    
494            public void setCharsetName(String charsetName) {
495                    this.charsetName = charsetName;
496            }
497    
498            public int getNIterations() {
499                    return nIterations;
500            }
501    
502            public void setNIterations(int iterations) {
503                    nIterations = iterations;
504            }
505    
506            public int getIterationCounter() {
507                    return cIterations;
508            }
509    //      public TigerXMLHeader getHeader() {
510    //              return header;
511    //      }
512    //      
513    //      public void setHeader(TigerXMLHeader header) {
514    //              this.header = header;
515    //      }
516    }