1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.fileupload.util.mime;
18
19 import java.io.ByteArrayOutputStream;
20 import java.io.UnsupportedEncodingException;
21 import java.util.Base64;
22 import java.util.HashMap;
23 import java.util.Locale;
24 import java.util.Map;
25
26 /**
27 * Utility class to decode MIME texts.
28 *
29 * @since 1.3
30 */
31 public final class MimeUtility {
32
33 /**
34 * The {@code US-ASCII} charset identifier constant.
35 */
36 private static final String US_ASCII_CHARSET = "US-ASCII";
37
38 /**
39 * The marker to indicate text is encoded with BASE64 algorithm.
40 */
41 private static final String BASE64_ENCODING_MARKER = "B";
42
43 /**
44 * The marker to indicate text is encoded with QuotedPrintable algorithm.
45 */
46 private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
47
48 /**
49 * If the text contains any encoded tokens, those tokens will be marked with "=?".
50 */
51 private static final String ENCODED_TOKEN_MARKER = "=?";
52
53 /**
54 * If the text contains any encoded tokens, those tokens will terminate with "=?".
55 */
56 private static final String ENCODED_TOKEN_FINISHER = "?=";
57
58 /**
59 * The linear whitespace chars sequence.
60 */
61 private static final String LINEAR_WHITESPACE = " \t\r\n";
62
63 /**
64 * Mappings between MIME and Java charset.
65 */
66 private static final Map<String, String> MIME2JAVA = new HashMap<>();
67
68 static {
69 MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
70 MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
71 MIME2JAVA.put("utf-8", "UTF8");
72 MIME2JAVA.put("utf8", "UTF8");
73 MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
74 MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
75 MIME2JAVA.put("euc-kr", "KSC5601");
76 MIME2JAVA.put("euckr", "KSC5601");
77 MIME2JAVA.put("us-ascii", "ISO-8859-1");
78 MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
79 }
80
81 /**
82 * Decode a string of text obtained from a mail header into
83 * its proper form. The text generally will consist of a
84 * string of tokens, some of which may be encoded using
85 * base64 encoding.
86 *
87 * @param text The text to decode.
88 * @return The decoded text string.
89 * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
90 */
91 public static String decodeText(final String text) throws UnsupportedEncodingException {
92 // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
93 // source string doesn't contain that sequent, no decoding is required.
94 if (!text.contains(ENCODED_TOKEN_MARKER)) {
95 return text;
96 }
97
98 int offset = 0;
99 final int endOffset = text.length();
100
101 int startWhiteSpace = -1;
102 int endWhiteSpace = -1;
103
104 final StringBuilder decodedText = new StringBuilder(text.length());
105
106 boolean previousTokenEncoded = false;
107
108 while (offset < endOffset) {
109 char ch = text.charAt(offset);
110
111 // is this a whitespace character?
112 if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
113 startWhiteSpace = offset;
114 while (offset < endOffset) {
115 // step over the white space characters.
116 ch = text.charAt(offset);
117 if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
118 // record the location of the first non lwsp and drop down to process the
119 // token characters.
120 endWhiteSpace = offset;
121 break;
122 }
123 offset++;
124 }
125 } else {
126 // we have a word token. We need to scan over the word and then try to parse it.
127 final int wordStart = offset;
128
129 while (offset < endOffset) {
130 // step over the non white space characters.
131 ch = text.charAt(offset);
132 if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
133 break;
134 }
135 offset++;
136
137 //NB: Trailing whitespace on these header strings will just be discarded.
138 }
139 // pull out the word token.
140 final String word = text.substring(wordStart, offset);
141 // is the token encoded? decode the word
142 if (word.startsWith(ENCODED_TOKEN_MARKER)) {
143 try {
144 // if this gives a parsing failure, treat it like a non-encoded word.
145 final String decodedWord = decodeWord(word);
146
147 // are any whitespace characters significant? Append 'em if we've got 'em.
148 if (!previousTokenEncoded && startWhiteSpace != -1) {
149 decodedText.append(text, startWhiteSpace, endWhiteSpace);
150 startWhiteSpace = -1;
151 }
152 // this is definitely a decoded token.
153 previousTokenEncoded = true;
154 // and add this to the text.
155 decodedText.append(decodedWord);
156 // we continue parsing from here...we allow parsing errors to fall through
157 // and get handled as normal text.
158 continue;
159
160 } catch (final ParseException e) {
161 // just ignore it, skip to next word
162 }
163 }
164 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
165 // if we have it.
166 if (startWhiteSpace != -1) {
167 decodedText.append(text, startWhiteSpace, endWhiteSpace);
168 startWhiteSpace = -1;
169 }
170 // this is not a decoded token.
171 previousTokenEncoded = false;
172 decodedText.append(word);
173 }
174 }
175
176 return decodedText.toString();
177 }
178
179 /**
180 * Parse a string using the RFC 2047 rules for an "encoded-word"
181 * type. This encoding has the syntax:
182 *
183 * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
184 *
185 * @param word The possibly encoded word value.
186 * @return The decoded word.
187 * @throws ParseException in case of a parse error of the RFC 2047
188 * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found
189 */
190 private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
191 // encoded words start with the characters "=?". If this not an encoded word, we throw a
192 // ParseException for the caller.
193
194 if (!word.startsWith(ENCODED_TOKEN_MARKER)) {
195 throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
196 }
197
198 final int charsetPos = word.indexOf('?', 2);
199 if (charsetPos == -1) {
200 throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
201 }
202
203 // pull out the character set information (this is the MIME name at this point).
204 final String charset = word.substring(2, charsetPos).toLowerCase(Locale.ROOT);
205
206 // now pull out the encoding token the same way.
207 final int encodingPos = word.indexOf('?', charsetPos + 1);
208 if (encodingPos == -1) {
209 throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
210 }
211
212 final String encoding = word.substring(charsetPos + 1, encodingPos);
213
214 // and finally the encoded text.
215 final int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
216 if (encodedTextPos == -1) {
217 throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
218 }
219
220 final String encodedText = word.substring(encodingPos + 1, encodedTextPos);
221
222 // seems a bit silly to encode a null string, but easy to deal with.
223 if (encodedText.isEmpty()) {
224 return "";
225 }
226
227 try {
228 // the decoder writes directly to an output stream.
229 final ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
230
231 final byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET);
232
233 // Base64 encoded?
234 if (encoding.equals(BASE64_ENCODING_MARKER)) {
235 out.write(Base64.getDecoder().decode(encodedData));
236 } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
237 QuotedPrintableDecoder.decode(encodedData, out);
238 } else {
239 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
240 }
241 // get the decoded byte data and convert into a string.
242 final byte[] decodedData = out.toByteArray();
243 return new String(decodedData, javaCharset(charset));
244 } catch (final Exception e) {
245 throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
246 }
247 }
248
249 /**
250 * Translate a MIME standard character set name into the Java
251 * equivalent.
252 *
253 * @param charset The MIME standard name.
254 * @return The Java equivalent for this name.
255 */
256 private static String javaCharset(final String charset) {
257 // nothing in, nothing out.
258 if (charset == null) {
259 return null;
260 }
261
262 final String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ROOT));
263 // if there is no mapping, then the original name is used. Many of the MIME character set
264 // names map directly back into Java. The reverse isn't necessarily true.
265 if (mappedCharset == null) {
266 return charset;
267 }
268 return mappedCharset;
269 }
270
271 /**
272 * Hidden constructor, this class must not be instantiated.
273 */
274 private MimeUtility() {
275 // do nothing
276 }
277
278 }