| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| MimeUtility | 
 | 
 | 9.25;9.25 | 
| 1 |  /* | |
| 2 |   * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 |   * contributor license agreements.  See the NOTICE file distributed with | |
| 4 |   * this work for additional information regarding copyright ownership. | |
| 5 |   * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 |   * (the "License"); you may not use this file except in compliance with | |
| 7 |   * the License.  You may obtain a copy of the License at | |
| 8 |   * | |
| 9 |   *      http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 |   * | |
| 11 |   * Unless required by applicable law or agreed to in writing, software | |
| 12 |   * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 |   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 |   * See the License for the specific language governing permissions and | |
| 15 |   * limitations under the License. | |
| 16 |   */ | |
| 17 |  package org.apache.commons.fileupload.util.mime; | |
| 18 | ||
| 19 |  import java.io.ByteArrayOutputStream; | |
| 20 |  import java.io.IOException; | |
| 21 |  import java.io.UnsupportedEncodingException; | |
| 22 |  import java.util.HashMap; | |
| 23 |  import java.util.Locale; | |
| 24 |  import java.util.Map; | |
| 25 | ||
| 26 |  /** | |
| 27 |   * Utility class to decode MIME texts. | |
| 28 |   * | |
| 29 |   * @since 1.3 | |
| 30 |   */ | |
| 31 | public final class MimeUtility { | |
| 32 | ||
| 33 |      /** | |
| 34 |       * The {@code US-ASCII} charset identifier constant. | |
| 35 |       */ | |
| 36 | private static final String US_ASCII_CHARSET = "US-ASCII"; | |
| 37 | ||
| 38 |      /** | |
| 39 |       * The marker to indicate text is encoded with BASE64 algorithm. | |
| 40 |       */ | |
| 41 | private static final String BASE64_ENCODING_MARKER = "B"; | |
| 42 | ||
| 43 |      /** | |
| 44 |       * The marker to indicate text is encoded with QuotedPrintable algorithm. | |
| 45 |       */ | |
| 46 | private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q"; | |
| 47 | ||
| 48 |      /** | |
| 49 |       * If the text contains any encoded tokens, those tokens will be marked with "=?". | |
| 50 |       */ | |
| 51 | private static final String ENCODED_TOKEN_MARKER = "=?"; | |
| 52 | ||
| 53 |      /** | |
| 54 |       * If the text contains any encoded tokens, those tokens will terminate with "=?". | |
| 55 |       */ | |
| 56 | private static final String ENCODED_TOKEN_FINISHER = "?="; | |
| 57 | ||
| 58 |      /** | |
| 59 |       * The linear whitespace chars sequence. | |
| 60 |       */ | |
| 61 | private static final String LINEAR_WHITESPACE = " \t\r\n"; | |
| 62 | ||
| 63 |      /** | |
| 64 |       * Mappings between MIME and Java charset. | |
| 65 |       */ | |
| 66 | 1 | private static final Map<String, String> MIME2JAVA = new HashMap<String, String>(); | 
| 67 | ||
| 68 |      static { | |
| 69 | 1 | MIME2JAVA.put("iso-2022-cn", "ISO2022CN"); | 
| 70 | 1 | MIME2JAVA.put("iso-2022-kr", "ISO2022KR"); | 
| 71 | 1 | MIME2JAVA.put("utf-8", "UTF8"); | 
| 72 | 1 | MIME2JAVA.put("utf8", "UTF8"); | 
| 73 | 1 | MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP"); | 
| 74 | 1 | MIME2JAVA.put("ja_jp.eucjp", "EUCJIS"); | 
| 75 | 1 | MIME2JAVA.put("euc-kr", "KSC5601"); | 
| 76 | 1 | MIME2JAVA.put("euckr", "KSC5601"); | 
| 77 | 1 | MIME2JAVA.put("us-ascii", "ISO-8859-1"); | 
| 78 | 1 | MIME2JAVA.put("x-us-ascii", "ISO-8859-1"); | 
| 79 | 1 | } | 
| 80 | ||
| 81 |      /** | |
| 82 |       * Hidden constructor, this class must not be instantiated. | |
| 83 |       */ | |
| 84 | 0 |      private MimeUtility() { | 
| 85 |          // do nothing | |
| 86 | 0 |      } | 
| 87 | ||
| 88 |      /** | |
| 89 |       * Decode a string of text obtained from a mail header into | |
| 90 |       * its proper form.  The text generally will consist of a | |
| 91 |       * string of tokens, some of which may be encoded using | |
| 92 |       * base64 encoding. | |
| 93 |       * | |
| 94 |       * @param text   The text to decode. | |
| 95 |       * | |
| 96 |       * @return The decoded text string. | |
| 97 |       * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported. | |
| 98 |       */ | |
| 99 | public static String decodeText(String text) throws UnsupportedEncodingException { | |
| 100 |          // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the | |
| 101 |          // source string doesn't contain that sequent, no decoding is required. | |
| 102 | 6484 | if (text.indexOf(ENCODED_TOKEN_MARKER) < 0) { | 
| 103 | 6478 |              return text; | 
| 104 | } | |
| 105 | ||
| 106 | 6 |          int offset = 0; | 
| 107 | 6 |          int endOffset = text.length(); | 
| 108 | ||
| 109 | 6 |          int startWhiteSpace = -1; | 
| 110 | 6 |          int endWhiteSpace = -1; | 
| 111 | ||
| 112 | 6 |          StringBuilder decodedText = new StringBuilder(text.length()); | 
| 113 | ||
| 114 | 6 | boolean previousTokenEncoded = false; | 
| 115 | ||
| 116 | 19 | while (offset < endOffset) { | 
| 117 | 14 |              char ch = text.charAt(offset); | 
| 118 | ||
| 119 |              // is this a whitespace character? | |
| 120 | 14 | if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found | 
| 121 | 5 | startWhiteSpace = offset; | 
| 122 | 19 | while (offset < endOffset) { | 
| 123 |                      // step over the white space characters. | |
| 124 | 17 | ch = text.charAt(offset); | 
| 125 | 17 | if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found | 
| 126 | 14 | offset++; | 
| 127 |                      } else { | |
| 128 |                          // record the location of the first non lwsp and drop down to process the | |
| 129 |                          // token characters. | |
| 130 | 3 | endWhiteSpace = offset; | 
| 131 | 3 |                          break; | 
| 132 | } | |
| 133 | } | |
| 134 |              } else { | |
| 135 |                  // we have a word token.  We need to scan over the word and then try to parse it. | |
| 136 | 9 |                  int wordStart = offset; | 
| 137 | ||
| 138 | 416 | while (offset < endOffset) { | 
| 139 |                      // step over the non white space characters. | |
| 140 | 412 | ch = text.charAt(offset); | 
| 141 | 412 | if (LINEAR_WHITESPACE.indexOf(ch) == -1) { // not white space | 
| 142 | 407 | offset++; | 
| 143 |                      } else { | |
| 144 |                          break; | |
| 145 | } | |
| 146 | ||
| 147 |                      //NB:  Trailing whitespace on these header strings will just be discarded. | |
| 148 | } | |
| 149 |                  // pull out the word token. | |
| 150 | 9 | String word = text.substring(wordStart, offset); | 
| 151 |                  // is the token encoded?  decode the word | |
| 152 | 9 |                  if (word.startsWith(ENCODED_TOKEN_MARKER)) { | 
| 153 |                      try { | |
| 154 |                          // if this gives a parsing failure, treat it like a non-encoded word. | |
| 155 | 9 | String decodedWord = decodeWord(word); | 
| 156 | ||
| 157 |                          // are any whitespace characters significant?  Append 'em if we've got 'em. | |
| 158 | 8 |                          if (!previousTokenEncoded && startWhiteSpace != -1) { | 
| 159 | 0 |                              decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); | 
| 160 | 0 |                              startWhiteSpace = -1; | 
| 161 | } | |
| 162 |                          // this is definitely a decoded token. | |
| 163 | 8 |                          previousTokenEncoded = true; | 
| 164 |                          // and add this to the text. | |
| 165 | 8 | decodedText.append(decodedWord); | 
| 166 |                          // we continue parsing from here...we allow parsing errors to fall through | |
| 167 |                          // and get handled as normal text. | |
| 168 | 8 |                          continue; | 
| 169 | ||
| 170 | 0 |                      } catch (ParseException e) { | 
| 171 |                          // just ignore it, skip to next word | |
| 172 | } | |
| 173 | } | |
| 174 |                  // this is a normal token, so it doesn't matter what the previous token was.  Add the white space | |
| 175 |                  // if we have it. | |
| 176 | 0 |                  if (startWhiteSpace != -1) { | 
| 177 | 0 |                      decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); | 
| 178 | 0 |                      startWhiteSpace = -1; | 
| 179 | } | |
| 180 |                  // this is not a decoded token. | |
| 181 | 0 |                  previousTokenEncoded = false; | 
| 182 | 0 |                  decodedText.append(word); | 
| 183 | } | |
| 184 | 5 | } | 
| 185 | ||
| 186 | 5 |          return decodedText.toString(); | 
| 187 | } | |
| 188 | ||
| 189 |      /** | |
| 190 |       * Parse a string using the RFC 2047 rules for an "encoded-word" | |
| 191 |       * type.  This encoding has the syntax: | |
| 192 |       * | |
| 193 |       * encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" | |
| 194 |       * | |
| 195 |       * @param word   The possibly encoded word value. | |
| 196 |       * | |
| 197 |       * @return The decoded word. | |
| 198 |       * @throws ParseException | |
| 199 |       * @throws UnsupportedEncodingException | |
| 200 |       */ | |
| 201 | private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException { | |
| 202 |          // encoded words start with the characters "=?".  If this not an encoded word, we throw a | |
| 203 |          // ParseException for the caller. | |
| 204 | ||
| 205 | 9 |          if (!word.startsWith(ENCODED_TOKEN_MARKER)) { | 
| 206 | 0 |              throw new ParseException("Invalid RFC 2047 encoded-word: " + word); | 
| 207 | } | |
| 208 | ||
| 209 | 9 | int charsetPos = word.indexOf('?', 2); | 
| 210 | 9 |          if (charsetPos == -1) { | 
| 211 | 0 |              throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word); | 
| 212 | } | |
| 213 | ||
| 214 |          // pull out the character set information (this is the MIME name at this point). | |
| 215 | 9 | String charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH); | 
| 216 | ||
| 217 |          // now pull out the encoding token the same way. | |
| 218 | 9 | int encodingPos = word.indexOf('?', charsetPos + 1); | 
| 219 | 9 |          if (encodingPos == -1) { | 
| 220 | 0 |              throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word); | 
| 221 | } | |
| 222 | ||
| 223 | 9 | String encoding = word.substring(charsetPos + 1, encodingPos); | 
| 224 | ||
| 225 |          // and finally the encoded text. | |
| 226 | 9 |          int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1); | 
| 227 | 9 |          if (encodedTextPos == -1) { | 
| 228 | 0 |              throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word); | 
| 229 | } | |
| 230 | ||
| 231 | 9 | String encodedText = word.substring(encodingPos + 1, encodedTextPos); | 
| 232 | ||
| 233 |          // seems a bit silly to encode a null string, but easy to deal with. | |
| 234 | 9 |          if (encodedText.length() == 0) { | 
| 235 | 0 |              return ""; | 
| 236 | } | |
| 237 | ||
| 238 |          try { | |
| 239 |              // the decoder writes directly to an output stream. | |
| 240 | 9 |              ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length()); | 
| 241 | ||
| 242 | 9 |              byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET); | 
| 243 | ||
| 244 |              // Base64 encoded? | |
| 245 | 9 | if (encoding.equals(BASE64_ENCODING_MARKER)) { | 
| 246 | 8 | Base64Decoder.decode(encodedData, out); | 
| 247 | 1 |              } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable. | 
| 248 | 1 | QuotedPrintableDecoder.decode(encodedData, out); | 
| 249 |              } else { | |
| 250 | 0 |                  throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding); | 
| 251 | } | |
| 252 |              // get the decoded byte data and convert into a string. | |
| 253 | 8 |              byte[] decodedData = out.toByteArray(); | 
| 254 | 8 | return new String(decodedData, javaCharset(charset)); | 
| 255 | 1 |          } catch (IOException e) { | 
| 256 | 1 | throw new UnsupportedEncodingException("Invalid RFC 2047 encoding"); | 
| 257 | } | |
| 258 | } | |
| 259 | ||
| 260 |      /** | |
| 261 |       * Translate a MIME standard character set name into the Java | |
| 262 |       * equivalent. | |
| 263 |       * | |
| 264 |       * @param charset The MIME standard name. | |
| 265 |       * | |
| 266 |       * @return The Java equivalent for this name. | |
| 267 |       */ | |
| 268 | private static String javaCharset(String charset) { | |
| 269 |          // nothing in, nothing out. | |
| 270 | 8 |          if (charset == null) { | 
| 271 | 0 |              return null; | 
| 272 | } | |
| 273 | ||
| 274 | 8 | String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH)); | 
| 275 |          // if there is no mapping, then the original name is used.  Many of the MIME character set | |
| 276 |          // names map directly back into Java.  The reverse isn't necessarily true. | |
| 277 | 8 | if (mappedCharset == null) { | 
| 278 | 6 |              return charset; | 
| 279 | } | |
| 280 | 2 |          return mappedCharset; | 
| 281 | } | |
| 282 | ||
| 283 | } |