001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.SortedMap;
011    import java.util.regex.PatternSyntaxException;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.io.dataformat.ColumnDescription;
015    import org.maltparser.core.io.dataformat.DataFormatException;
016    import org.maltparser.core.io.dataformat.DataFormatInstance;
017    import org.maltparser.core.symbol.SymbolTable;
018    import org.maltparser.core.syntaxgraph.PhraseStructure;
019    import org.maltparser.core.syntaxgraph.TokenStructure;
020    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
021    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
022    import org.maltparser.core.syntaxgraph.node.TokenNode;
023    import org.maltparser.ml.libsvm.LibsvmException;
024    /**
025    *
026    *
027    * @author Johan Hall
028    */
029    public class BracketWriter implements SyntaxGraphWriter {
030            private enum PennWriterFormat {
031                    DEFAULT, PRETTY
032            };
033            private PennWriterFormat format;
034            private BufferedWriter writer;
035            private DataFormatInstance dataFormatInstance;
036            private SortedMap<String,ColumnDescription> inputColumns;
037            private SortedMap<String,ColumnDescription> edgeLabelColumns;
038            private SortedMap<String,ColumnDescription> phraseLabelColumns;
039            private char STARTING_BRACKET = '(';
040            private String EMPTY_EDGELABEL = "??";
041            private char CLOSING_BRACKET = ')';
042            private char INPUT_SEPARATOR = ' ';
043            private char EDGELABEL_SEPARATOR = '-';
044            private char SENTENCE_SEPARATOR = '\n';
045            private String optionString;
046            private boolean closeStream = true;
047            
048            public BracketWriter() { 
049            }
050    
051            public void open(String fileName, String charsetName) throws MaltChainedException {
052                    try {
053                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
054                    } catch (FileNotFoundException e) {
055                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
056                    } catch (UnsupportedEncodingException e) {
057                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
058                    }       
059            }
060            
061            public void open(OutputStream os, String charsetName) throws MaltChainedException {
062                    try {
063                            if (os == System.out || os == System.err) {
064                                    closeStream = false;
065                            }
066                            open(new OutputStreamWriter(os, charsetName));
067                    } catch (UnsupportedEncodingException e) {
068                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
069                    }
070            }
071            
072            private void open(OutputStreamWriter osw) throws MaltChainedException {
073                    setWriter(new BufferedWriter(osw));
074            }
075    
076            public void writeEpilog() throws MaltChainedException {
077    
078            }
079            
080            public void writeProlog() throws MaltChainedException {
081            
082            }
083            
084            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
085                    if (syntaxGraph == null || dataFormatInstance == null) {
086                            return;
087                    }
088                    if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) {
089    //                      PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph);
090                            if (format == PennWriterFormat.PRETTY) {
091                                    writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0);
092                            } else {
093                                    writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot());
094                            }
095                            try {
096                                    writer.write(SENTENCE_SEPARATOR);
097                                    writer.flush();
098                            } catch (IOException e) {
099                                    close();
100                                    throw new DataFormatException("Could not write to the output file. ", e);
101                            }
102                    }
103            }
104            
105            private void writeElement(PhraseStructureNode element) throws MaltChainedException {
106                    try {
107                            if (element instanceof TokenNode) {
108                                    PhraseStructureNode t = (PhraseStructureNode)element;
109                                    SymbolTable table = null;
110                                    writer.write(STARTING_BRACKET);
111                                    int i = 0;
112                                    for (String inputColumn : inputColumns.keySet()) {
113                                            if (i != 0) {
114                                                    writer.write(INPUT_SEPARATOR);
115                                            }
116                                            table = inputColumns.get(inputColumn).getSymbolTable();
117                                            if (t.hasLabel(table)) {
118                                                    writer.write(t.getLabelSymbol(table));
119                                            }
120                                            if (i == 0) {
121                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
122                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
123                                                            if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
124                                                                    writer.write(EDGELABEL_SEPARATOR);
125                                                                    writer.write(t.getParentEdgeLabelSymbol(table));
126                                                            }
127                                                    }
128                                            }
129                                            i++;
130                                    }
131                                    writer.write(CLOSING_BRACKET);
132                            } else {
133                                    NonTerminalNode nt = (NonTerminalNode)element;
134                                    writer.write(STARTING_BRACKET);
135                                    SymbolTable table = null;
136                                    int i = 0;
137                                    for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
138                                            if (i != 0) {
139                                                    writer.write(INPUT_SEPARATOR);
140                                            }
141                                            table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
142                                            if (nt.hasLabel(table)) { 
143                                                    writer.write(nt.getLabelSymbol(table));
144                                            }
145                                            if (i == 0) {
146                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
147                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
148                                                            if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
149                                                                    writer.write(EDGELABEL_SEPARATOR);
150                                                                    writer.write(nt.getParentEdgeLabelSymbol(table));
151                                                            }
152                                                    }
153                                            }
154                                            i++;
155                                    }
156                                    for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
157                                            writeElement(node);
158                                    }
159                                    writer.write(CLOSING_BRACKET);
160                            }
161                    } catch (IOException e) {
162                            throw new DataFormatException("Could not write to the output file. ", e);
163                    }
164            }
165            
166            private String getIndentation(int depth) {
167                    StringBuilder sb = new StringBuilder("");
168                    for (int i = 0; i < depth; i++) {
169                            sb.append("\t");
170                    }
171                    return sb.toString();
172            }
173            
174            private void writeElement(PhraseStructureNode element, int depth) throws MaltChainedException {
175                    try {
176                            if (element instanceof TokenNode) {
177                                    PhraseStructureNode t = (PhraseStructureNode)element;
178                                    SymbolTable table = null;
179                                    writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
180                                    int i = 0;
181                                    for (String inputColumn : inputColumns.keySet()) {
182                                            if (i != 0) {
183                                                    writer.write(INPUT_SEPARATOR);
184                                            }
185                                            table = inputColumns.get(inputColumn).getSymbolTable();
186                                            if (t.hasLabel(table)) {
187                                                    writer.write(encodeString(t.getLabelSymbol(table)));
188                                            }
189                                            if (i == 0) {
190                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
191                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
192                                                            if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
193                                                                    writer.write(EDGELABEL_SEPARATOR);
194                                                                    writer.write(t.getParentEdgeLabelSymbol(table));
195                                                            }
196                                                    }
197                                            }
198                                            i++;
199                                    }
200                                    writer.write(CLOSING_BRACKET);
201                            } else {
202                                    NonTerminalNode nt = (NonTerminalNode)element;
203                                    writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
204                                    SymbolTable table = null;
205                                    int i = 0;
206                                    for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
207                                            if (i != 0) {
208                                                    writer.write(INPUT_SEPARATOR);
209                                            }
210                                            table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
211                                            if (nt.hasLabel(table)) { 
212                                                    writer.write(nt.getLabelSymbol(table));
213                                            }
214                                            if (i == 0) {
215                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
216                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
217                                                            if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
218                                                                    writer.write(EDGELABEL_SEPARATOR);
219                                                                    writer.write(nt.getParentEdgeLabelSymbol(table));
220                                                            }
221                                                    }
222                                            }
223                                            i++;
224                                    }
225                                    for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
226                                            writeElement(node, depth + 1);
227                                    }
228                                    writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET);
229                            }
230                    } catch (IOException e) {
231                            throw new DataFormatException("Could not write to the output file. ", e);
232                    }
233            }
234            
235            public BufferedWriter getWriter() {
236                    return writer;
237            }
238    
239            public void setWriter(BufferedWriter writer) throws MaltChainedException {
240                    close();
241                    this.writer = writer;
242            }
243            
244            public DataFormatInstance getDataFormatInstance() {
245                    return dataFormatInstance;
246            }
247    
248            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
249                    this.dataFormatInstance = dataFormatInstance;
250                    inputColumns = dataFormatInstance.getInputColumnDescriptions();
251                    edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
252                    phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
253            }
254    
255            public String getOptions() {
256                    return optionString;
257            }
258            
259            public void setOptions(String optionString) throws MaltChainedException {
260                    this.optionString = optionString;
261                    format = PennWriterFormat.DEFAULT;
262    
263                    String[] argv;
264                    try {
265                            argv = optionString.split("[_\\p{Blank}]");
266                    } catch (PatternSyntaxException e) {
267                            throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e);
268                    }
269                    for (int i=0; i < argv.length-1; i++) {
270                            if(argv[i].charAt(0) != '-') {
271                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
272                            }
273                            if(++i>=argv.length) {
274                                    throw new DataFormatException("The last argument does not have any value. ");
275                            }
276                            switch(argv[i-1].charAt(1)) {
277                            case 'f': 
278                                    if (argv[i].equals("p")) {
279                                            format = PennWriterFormat.PRETTY;
280                                    } else if (argv[i].equals("p")) {
281                                            format = PennWriterFormat.DEFAULT;
282                                    }
283                                    break;
284                            default:
285                                    throw new LibsvmException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");         
286                            }
287                    }       
288            }
289            
290            public void close() throws MaltChainedException {
291                    try {
292                            if (writer != null) {
293                                    writer.flush();
294                                    if (closeStream) {
295                                            writer.close();
296                                    }
297                                    writer = null;
298                            }
299                    }   catch (IOException e) {
300                            throw new DataFormatException("Could not close the output file. ", e);
301                    } 
302            }
303            
304            private String encodeString(String string) {
305                    return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-");
306            }
307    }