//
//  CharacterPronunciations.m
//  FCIMDatabaseBuilder
//
//  Created by Andrew Choi on 19/08/08.
//  Copyright 2008 Andrew Choi. All rights reserved.
//

/*
 
 Permission for the use of this code is granted only for research, educational, and non-commercial purposes.
 
 Redistribution of this code or its parts in source, binary, and any other form without permission, with or without modification, is prohibited.  Modifications include, but are not limited to, translation to other programming languages and reuse of tables, constant definitions, and API's defined in it.
 
 Andrew Choi is not liable for any losses or damages caused by the use of this software.
 
 */

/* This class implements a parser for the file Unihan.txt and produces a dictionary that maps Han characters to their Cantonese pronunciations.  Replace this code (but keep the output dictionary in the same format) to switch to another pronunciation file (such as LSHK's jyutping data file).  */

#import "CharacterPronunciations.h"

@implementation CharacterPronunciations

/* Returns s without its trailing tone digit ('1' through '6').  A string that does not end in such a digit is returned as is.  An empty (or nil) string is also returned as is, instead of asking for the character at index length - 1, which raises NSRangeException on an empty string.  */
NSString *stripToneDigit(NSString *s)
{
	NSUInteger length = [s length];
	
	// Nothing to inspect, and length - 1 would wrap around.
	if (length == 0)
		return s;
	
	unichar tone = [s characterAtIndex:length - 1];
	
	if ('1' <= tone && tone <= '6')
		return [s substringToIndex:length - 1];
	else
		return s;
}

/* Rewrites the syllable s through the consonant and vowel tables of the romanization named by ROMANIZATION (interpolated with %s, so presumably a C string supplied by the build -- confirm).  s is cut at every position from 0 up to, but not including, its length: the head is looked up in ConsonantMap.plist and the tail in VowelMap.plist.  The first cut for which both lookups succeed yields the result (mapped head followed by mapped tail); if no cut qualifies, s is returned unchanged.  Both tables are loaded on first use and kept in static variables.  An NSException named "fixRomanization:" is thrown when either file cannot be loaded.  */
NSString *fixRomanization(NSString *s)
{
	static NSDictionary *consonantMap = nil;
	static NSDictionary *vowelMap = nil;
	
	// Lazily load the consonant table first, then the vowel table.
	if (consonantMap == nil)
	{
		NSString *consonantPath = [NSString stringWithFormat:@"../../RomanizationMaps/%s/ConsonantMap.plist", ROMANIZATION];
		consonantMap = [[NSDictionary alloc] initWithContentsOfFile:consonantPath];
		if (consonantMap == nil)
			@throw [NSException exceptionWithName:@"fixRomanization:" reason:@"Can't load consonant map file" userInfo:nil];
	}
	
	if (vowelMap == nil)
	{
		NSString *vowelPath = [NSString stringWithFormat:@"../../RomanizationMaps/%s/VowelMap.plist", ROMANIZATION];
		vowelMap = [[NSDictionary alloc] initWithContentsOfFile:vowelPath];
		if (vowelMap == nil)
			@throw [NSException exceptionWithName:@"fixRomanization:" reason:@"Can't load vowel map file" userInfo:nil];
	}
	
	// Try each split point in turn; the head may be empty, the tail never is.
	int length = [s length];
	int split = 0;
	while (split < length)
	{
		NSString *mappedConsonant = [consonantMap objectForKey:[s substringToIndex:split]];
		NSString *mappedVowel = [vowelMap objectForKey:[s substringFromIndex:split]];
		
		if (mappedConsonant != nil && mappedVowel != nil)
			return [mappedConsonant stringByAppendingString:mappedVowel];
		
		split++;
	}
	
	// No split matched both tables: leave the syllable as it came in.
	return s;
}

/* Returns a new array holding, for every pronunciation in a and in the same order, the result of stripping its tone digit with stripToneDigit() and then converting it with fixRomanization().  */
NSArray *fixPronunciations(NSArray *a)
{
	NSMutableArray *fixed = [NSMutableArray arrayWithCapacity:[a count]];
	
	for (NSString *original in a)
	{
		NSString *toneless = stripToneDigit(original);
		[fixed addObject:fixRomanization(toneless)];
	}
	
	return fixed;
}

/* Returns the distinct objects of a (compared with -isEqual:/-hash), each at the position of its first occurrence.  Keeping the input order, rather than going through -[NSSet allObjects] whose order is undefined, makes the result the same on every run.  */
NSArray *removeDuplicates(NSArray *a)
{
	NSMutableArray *unique = [NSMutableArray arrayWithCapacity:[a count]];
	NSMutableSet *seen = [NSMutableSet setWithCapacity:[a count]];
	
	for (id object in a)
	{
		// Only the first occurrence of each object is kept.
		if ([seen containsObject:object])
			continue;
		
		[seen addObject:object];
		[unique addObject:object];
	}
	
	return unique;
}

/* Parses one line of the form "U+XXXX<tab>tag<tab>value".  When the tag is kCantonese and the code point field is six characters long ("U+" plus four hex digits), the whitespace-separated pronunciations in value are stripped of tones, re-romanized, de-duplicated, and stored in `dictionary` under the one-character string for that code point.  Empty lines, comment lines (starting with '#'), lines with other tags, and lines missing any of the three fields are ignored.  */
- (void)parseLine:(NSString *)line
{
	// An empty line has no first character to test and nothing to parse.
	if ([line length] == 0 || [line hasPrefix:@"#"])
		return;
	
	NSScanner *scanner = [NSScanner scannerWithString:line];
	
	NSString *unicodeScalarValue = nil;
	NSString *tag = nil;
	
	// A failed scan leaves its out-parameter untouched, so stop instead of reading it.
	if (![scanner scanUpToString:@"\t" intoString:&unicodeScalarValue])
		return;
	[scanner scanString:@"\t" intoString:NULL];
	if (![scanner scanUpToString:@"\t" intoString:&tag])
		return;
	[scanner scanString:@"\t" intoString:NULL];
	
	if (![tag isEqualToString:@"kCantonese"])
		return;
	
	// The value is the rest of the line after the tab that follows the tag.  Step over that tab only if the scanner is still positioned on it, and give up when nothing follows (substringFromIndex: raises past the end of the string).
	NSUInteger valueStart = [scanner scanLocation];
	if (valueStart < [line length] && [line characterAtIndex:valueStart] == '\t')
		valueStart++;
	if (valueStart >= [line length])
		return;
	NSString *value = [line substringFromIndex:valueStart];
	
	// Only handle 2-byte codes.
	if ([unicodeScalarValue length] != 6)
		return;
	
	// Require "U+" followed by hex digits up to the end of the field; otherwise `unicode` would be meaningless.
	NSScanner *scanner2 = [NSScanner scannerWithString:unicodeScalarValue];
	unsigned int unicode = 0;
	if (![scanner2 scanString:@"U+" intoString:NULL] || ![scanner2 scanHexInt:&unicode] || ![scanner2 isAtEnd])
		return;
	
	unichar unicodeUnichar = (unichar)unicode;
	NSString *unicodeString = [NSString stringWithCharacters:&unicodeUnichar length:1];
	
	// Splitting on a character set yields empty strings for leading, trailing, or repeated separators; those are not pronunciations, so drop them.  Newline characters are treated as separators too, so a stray line terminator (e.g. the "\r" of a CRLF file) does not end up glued to the last pronunciation.
	NSArray *components = [value componentsSeparatedByCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
	NSMutableArray *pronunciations = [NSMutableArray arrayWithCapacity:[components count]];
	for (NSString *component in components)
	{
		if ([component length] > 0)
			[pronunciations addObject:component];
	}
	
	if ([pronunciations count] == 0)
		return;
	
	NSArray *fixedPronunciations = fixPronunciations(pronunciations);
	
	// Duplicates may exist because we strip tones from pronunciations.  E.g., U+5047 is pronounced either "gaa2" (holiday) or "gaa3" (false).  Strangely, Unihan.txt also contains duplicate pronunciations (same tone) for some characters!
	[dictionary setObject:removeDuplicates(fixedPronunciations) forKey:unicodeString];
}

/* Initializes the receiver from the pronunciation file at `filename`, read as UTF-8.  The file is walked line by line ("\n"-terminated, with leading spaces and tabs skipped) and every non-blank line is handed to -parseLine:, which fills `dictionary`.  Returns nil, after logging the reason, when the file cannot be read.  */
- (CharacterPronunciations *)initWithFile:(NSString *)filename
{
	self = [super init];
	
	if (self)
	{
		dictionary = [[NSMutableDictionary alloc] initWithCapacity:0];
		
		NSError *error = nil;
		NSString *pronunciationDatabase = [NSString stringWithContentsOfFile:filename encoding:NSUTF8StringEncoding error:&error];
		if (!pronunciationDatabase)
		{
			// There is nothing to scan; report why and fail the initializer.
			NSLog(@"CharacterPronunciations: can't read %@: %@", filename, error);
			[self release];
			return nil;
		}
		
		NSScanner *scanner = [NSScanner scannerWithString:pronunciationDatabase];
		
		[scanner setCharactersToBeSkipped:[NSCharacterSet whitespaceCharacterSet]];
		
		while (![scanner isAtEnd])
		{
			// Extract a line.  On a blank line scanUpToString: finds nothing and leaves `line` untouched, so remember whether it succeeded.
			NSString *line = nil;
			BOOL lineHasContent = [scanner scanUpToString:@"\n" intoString:&line];
			[scanner scanString:@"\n" intoString:NULL];
			
			// Process it
			if (lineHasContent)
				[self parseLine:line];
		}
	}
	
	return self;
}

/* Releases the dictionary allocated in -initWithFile: and then lets the superclass finish deallocation.  */
- (void)dealloc
{
	// Give up our ownership of the ivar before handing over to super.
	[dictionary release];
	
	[super dealloc];
}

/* Convenience constructor: allocates a CharacterPronunciations, initializes it with -initWithFile: for `filename`, and returns it autoreleased.  */
+ (CharacterPronunciations *)characterPronunciationsWithFile:(NSString *)filename
{
	CharacterPronunciations *pronunciations = [CharacterPronunciations alloc];
	pronunciations = [pronunciations initWithFile:filename];
	
	return [pronunciations autorelease];
}

/* Returns the dictionary filled by -parseLine:.  Keys are one-character strings and values are arrays of pronunciation strings.  The ivar itself is returned, not a copy.  */
- (NSDictionary *)dictionary
{
	return dictionary;
}

@end
