Archive for May, 2005

Removing entities from HTML in Cocoa

Saturday, May 7th, 2005

To display accented characters and certain symbols in an HTML or XML document you need to encode them as character entities. For example, the copyright symbol © is represented in HTML as &copy;.

Applications like NewsMac Pro need to be able to decode these entities and translate them to the appropriate character. Straightforward, you might think, but actually it isn’t. There are multiple ways in which a character can be encoded: with a textual name, as above, but also with a decimal or hex value (&#169; or &#xA9; for the same copyright symbol). In NewsMac Pro I used to use NSAttributedString’s initWithHTML method; however, for whatever reason this seems to lock up under Tiger, so I had to find an alternative solution. I thought I’d post the following code to help out other developers, because if you go searching on this topic you will most likely get people telling you to use the NSAttributedString method.

This probably isn’t the most elegant bit of code ever, but it serves its purpose:

/**
 * Decodes HTML/XML character entities in a string.
 *
 * Recognised forms:
 *   - the markup entities &amp; &lt; &gt; &quot; &apos;
 *   - the HTML Latin-1 named entities &nbsp; (U+00A0) through &yuml; (U+00FF)
 *   - decimal references such as &#169;
 *   - hexadecimal references such as &#xA9; or &#XA9;
 *
 * The string is decoded in a single left-to-right pass, so text produced by
 * one entity is never decoded a second time ("&amp;lt;" becomes "&lt;", not
 * "<"). Anything that is not a well-formed, known entity (an unknown name, a
 * lone "&", a numeric reference with no digits, the value 0, a surrogate
 * value, or a value above U+10FFFF) is copied to the result unchanged.
 *
 * @param source The text to decode. May be nil.
 * @return nil when source is nil; source itself when it contains no "&";
 *         otherwise a new autoreleased string with the entities replaced.
 */
+ (NSString *) decodeCharacterEntitiesIn:(NSString *)source
{
  // Names for U+00A0..U+00FF in code point order: the character for
  // latin1Names[n] is 0x00A0 + n. Names are case sensitive (Aacute/aacute).
  static NSString * const latin1Names[] = {
    @"nbsp",   @"iexcl",  @"cent",   @"pound",  @"curren", @"yen",    @"brvbar", @"sect",
    @"uml",    @"copy",   @"ordf",   @"laquo",  @"not",    @"shy",    @"reg",    @"macr",
    @"deg",    @"plusmn", @"sup2",   @"sup3",   @"acute",  @"micro",  @"para",   @"middot",
    @"cedil",  @"sup1",   @"ordm",   @"raquo",  @"frac14", @"frac12", @"frac34", @"iquest",
    @"Agrave", @"Aacute", @"Acirc",  @"Atilde", @"Auml",   @"Aring",  @"AElig",  @"Ccedil",
    @"Egrave", @"Eacute", @"Ecirc",  @"Euml",   @"Igrave", @"Iacute", @"Icirc",  @"Iuml",
    @"ETH",    @"Ntilde", @"Ograve", @"Oacute", @"Ocirc",  @"Otilde", @"Ouml",   @"times",
    @"Oslash", @"Ugrave", @"Uacute", @"Ucirc",  @"Uuml",   @"Yacute", @"THORN",  @"szlig",
    @"agrave", @"aacute", @"acirc",  @"atilde", @"auml",   @"aring",  @"aelig",  @"ccedil",
    @"egrave", @"eacute", @"ecirc",  @"euml",   @"igrave", @"iacute", @"icirc",  @"iuml",
    @"eth",    @"ntilde", @"ograve", @"oacute", @"ocirc",  @"otilde", @"ouml",   @"divide",
    @"oslash", @"ugrave", @"uacute", @"ucirc",  @"uuml",   @"yacute", @"thorn",  @"yuml"
  };

  // Markup entities do not follow the 0x00A0 + n rule, so they carry an
  // explicit character each.
  static NSString * const markupNames[] = { @"amp", @"lt", @"gt", @"quot", @"apos" };
  static const unichar markupCharacters[] = { '&', '<', '>', '"', '\'' };

  const NSUInteger latin1Count = sizeof(latin1Names) / sizeof(latin1Names[0]);
  const NSUInteger markupCount = sizeof(markupNames) / sizeof(markupNames[0]);

  // Longest text accepted between '&' and ';'. Bounding the search keeps a
  // stray '&' from pairing with a ';' far away and keeps the pass linear.
  const NSUInteger maxBodyLength = 32;

  if(!source) return nil;
  if([source rangeOfString: @"&"].location == NSNotFound) return source;

  NSUInteger length = [source length];
  NSMutableString *decoded = [NSMutableString stringWithCapacity: length];
  NSUInteger position = 0;

  while(position < length)
  {
    NSRange ampersand = [source rangeOfString: @"&"
                                      options: NSLiteralSearch
                                        range: NSMakeRange(position, length - position)];

    if(ampersand.location == NSNotFound)
    {
      // No more candidates: copy the tail and finish.
      [decoded appendString: [source substringFromIndex: position]];
      break;
    }

    // Copy the plain text that precedes the '&'.
    [decoded appendString:
      [source substringWithRange: NSMakeRange(position, ampersand.location - position)]];

    NSUInteger bodyStart = ampersand.location + 1;
    NSUInteger window = MIN(length - bodyStart, maxBodyLength + 1);
    NSRange semicolon = [source rangeOfString: @";"
                                      options: NSLiteralSearch
                                        range: NSMakeRange(bodyStart, window)];
    NSString *replacement = nil;

    if(semicolon.location != NSNotFound)
    {
      NSString *body =
        [source substringWithRange: NSMakeRange(bodyStart, semicolon.location - bodyStart)];
      NSUInteger bodyLength = [body length];

      if(bodyLength > 1 && [body characterAtIndex: 0] == '#')
      {
        // Numeric reference: "#" digits, or "#x"/"#X" hex digits.
        unichar marker = [body characterAtIndex: 1];
        BOOL isHex = (marker == 'x' || marker == 'X');
        unsigned long radix = isHex ? 16 : 10;
        NSUInteger firstDigit = isHex ? 2 : 1;
        unsigned long codePoint = 0;
        BOOL valid = (bodyLength > firstDigit);   // at least one digit required
        NSUInteger k;

        for(k = firstDigit; valid && k < bodyLength; k++)
        {
          unichar c = [body characterAtIndex: k];
          unsigned long digit = 0;

          if(c >= '0' && c <= '9') digit = c - '0';
          else if(isHex && c >= 'a' && c <= 'f') digit = 10 + (c - 'a');
          else if(isHex && c >= 'A' && c <= 'F') digit = 10 + (c - 'A');
          else valid = NO;

          if(valid)
          {
            codePoint = codePoint * radix + digit;
            // Checked on every step, so the accumulator can never overflow.
            if(codePoint > 0x10FFFF) valid = NO;
          }
        }

        // NUL and lone surrogate values are not characters; leave them as text.
        if(valid && codePoint != 0 && !(codePoint >= 0xD800 && codePoint <= 0xDFFF))
        {
          if(codePoint <= 0xFFFF)
          {
            unichar unit = (unichar)codePoint;
            replacement = [NSString stringWithCharacters: &unit length: 1];
          }
          else
          {
            // Outside the BMP: NSString stores UTF-16, so emit a surrogate pair.
            unsigned long offset = codePoint - 0x10000;
            unichar pair[2];
            pair[0] = (unichar)(0xD800 + (offset >> 10));
            pair[1] = (unichar)(0xDC00 + (offset & 0x3FF));
            replacement = [NSString stringWithCharacters: pair length: 2];
          }
        }
      }
      else
      {
        // Named entity.
        unichar character = 0;
        NSUInteger n;

        for(n = 0; n < markupCount && !character; n++)
        {
          if([body isEqualToString: markupNames[n]]) character = markupCharacters[n];
        }
        for(n = 0; n < latin1Count && !character; n++)
        {
          if([body isEqualToString: latin1Names[n]]) character = (unichar)(0x00A0 + n);
        }

        if(character) replacement = [NSString stringWithCharacters: &character length: 1];
      }
    }

    if(replacement)
    {
      [decoded appendString: replacement];
      position = semicolon.location + 1;   // continue after the ';'
    }
    else
    {
      // Not an entity: keep the '&' literally and rescan from the next character.
      [decoded appendString: @"&"];
      position = bodyStart;
    }
  }

  return decoded;    // Note this is autoreleased
}