// Lexer.cpp
// (c) 2004 exeal

#include "StdAfx.h"
#include "Lexer.h"
#include "TextSearcher.h"	// CBoundarySearcher::IsFirstCharacterOfCluster

using namespace Ascension;
using namespace std;
using namespace Manah::Text;

bool Private::TKeywordComparer::bCaseSensitive = true;


namespace {
	// Opener/closer bracket pairs accepted by CLexer::SetBrackets: Unicode 4.0 Ps/Pe
	// characters plus '<' and '>' (the original note mentions XML).
	// The table is scanned linearly and is terminated by a {0, 0} entry.
	const pair<char_t, char_t>	bracketPairs[] = {
		make_pair(0x0028, 0x0029),	// Parenthesis
		make_pair(0x003C, 0x003E),	// Less-Than/Greater-Than Sign
		make_pair(0x005B, 0x005D),	// Square Bracket
		make_pair(0x007B, 0x007D),	// Curly Bracket
		make_pair(0x0F3A, 0x0F3B),	// Tibetan Mark Gug Rtags Gyon and Gyas
		make_pair(0x0F3C, 0x0F3D),	// Tibetan Mark Ang Khang Gyon and Gyas
		make_pair(0x169B, 0x169C),	// Ogham Feather Mark and reversed one
//		make_pair(0x201A, 0x????),	// Single Low-9 Quotation Mark
//		make_pair(0x201E, 0x????),	// Double Low-9 Quotation Mark
		make_pair(0x2045, 0x2046),	// Square Bracket With Quill
		make_pair(0x207D, 0x207E),	// Superscript Parenthesis
		make_pair(0x208D, 0x208E),	// Subscript Parenthesis
		make_pair(0x2329, 0x232A),	// Pointing Angle Bracket
		make_pair(0x23B4, 0x23B5),	// Square Bracket (top/bottom)
		make_pair(0x2768, 0x2769),	// Medium Parenthesis Ornament
		make_pair(0x276A, 0x276B),	// Medium Flattened Parenthesis Ornament
		make_pair(0x276C, 0x276D),	// Medium Pointing Angle Bracket Ornament
		make_pair(0x276E, 0x276F),	// Heavy Pointing Angle Quotation Mark Ornament
		make_pair(0x2770, 0x2771),	// Heavy Pointing Angle Bracket Ornament
		make_pair(0x2772, 0x2773),	// Light Tortoise Shell Bracket Ornament
		make_pair(0x2774, 0x2775),	// Medium Curly Bracket Ornament
		make_pair(0x27E6, 0x27E7),	// Mathematical White Square Bracket
		make_pair(0x27E8, 0x27E9),	// Mathematical Angle Bracket
		make_pair(0x27EA, 0x27EB),	// Mathematical Double Angle Bracket
		make_pair(0x2983, 0x2984),	// White Curly Bracket
		make_pair(0x2985, 0x2986),	// White Parenthesis
		make_pair(0x2987, 0x2988),	// Z Notation Image Bracket
		make_pair(0x2989, 0x298A),	// Z Notation Binding Bracket
		make_pair(0x298B, 0x298C),	// Square Bracket With Underbar
		make_pair(0x298D, 0x298E),	// Left Square Bracket With Tick In Top Corner and Right ... Bottom
		make_pair(0x298F, 0x2990),	// Left Square Bracket With Tick In Bottom Corner and Right ... Top
		make_pair(0x2991, 0x2992),	// Angle Bracket With Dot
		make_pair(0x2993, 0x2994),	// Arc Less-Than Bracket
		make_pair(0x2995, 0x2996),	// Double Arc Greater-Than Bracket
		make_pair(0x2997, 0x2998),	// Black Tortoise Shell Bracket
		make_pair(0x29D8, 0x29D9),	// Wiggly Fence
		make_pair(0x29DA, 0x29DB),	// Double Wiggly Fence
		make_pair(0x29FC, 0x29FD),	// Pointing Curved Angle Bracket
		make_pair(0x3008, 0x3009),	// Angle Bracket
		make_pair(0x300A, 0x300B),	// Double Angle Bracket
		make_pair(0x300C, 0x300D),	// Corner Bracket
		make_pair(0x300E, 0x300F),	// White Corner Bracket
		make_pair(0x3010, 0x3011),	// Black Lenticular Bracket
		make_pair(0x3014, 0x3015),	// Tortoise Shell Bracket
		make_pair(0x3016, 0x3017),	// White Lenticular Bracket
		make_pair(0x3018, 0x3019),	// White Tortoise Shell Bracket
		make_pair(0x301A, 0x301B),	// White Square Bracket
		make_pair(0x301D, 0x301F),	// Double Prime Quotation Mark and reversed one
//		make_pair(0x????, 0x301E),	// Double Prime Quotation Mark (deprecated: mistaken analogue)
		make_pair(0xFD3E, 0xFD3F),	// Ornate Parenthesis
		make_pair(0xFE35, 0xFE36),	// Presentation Form For Vertical Parenthesis
		make_pair(0xFE37, 0xFE38),	// - Curly Bracket
		make_pair(0xFE39, 0xFE3A),	// - Tortoise Shell Bracket
		make_pair(0xFE3B, 0xFE3C),	// - Black Lenticular Bracket
		make_pair(0xFE3D, 0xFE3E),	// - Double Angle Bracket
		make_pair(0xFE3F, 0xFE40),	// - Angle Bracket
		make_pair(0xFE41, 0xFE42),	// - Corner Bracket
		make_pair(0xFE43, 0xFE44),	// - White Corner Bracket
		make_pair(0xFE45, 0xFE46),	// Sesame Dot and White one
		make_pair(0xFE47, 0xFE48),	// - Square Bracket
		make_pair(0xFE59, 0xFE5A),	// Small Parenthesis
		make_pair(0xFE5B, 0xFE5C),	// Small Curly Bracket
		make_pair(0xFE5D, 0xFE5E),	// Small Tortoise Shell Bracket
		make_pair(0xFF08, 0xFF09),	// Fullwidth Parenthesis
		make_pair(0xFF3B, 0xFF3D),	// Fullwidth Square Bracket
		make_pair(0xFF5B, 0xFF5D),	// Fullwidth Curly Bracket
		make_pair(0xFF5F, 0xFF60),	// Fullwidth White Parenthesis
		make_pair(0xFF62, 0xFF63),	// Halfwidth Corner Bracket
		make_pair(0x0000, 0x0000)	// terminator: SetBrackets stops scanning at first == 0
	};

	// The included tables below are tied to Unicode 4.0; refuse to build against another version.
#if ASCENSION_UNICODE_VERSION != 0x0400
#error These array is based on old version of Unicode.
#endif
	// Code points with the ID_Start property (the original note cites DerivedCoreProperties.txt
	// and a script named idstart.pl, Unicode 4.0).
	// Searched with std::binary_search, so the included list must be sorted ascending.
	const Manah::Text::CodePoint	_arrIDStart[] = {
#include "script\Lexer_IdentifierStart_4_0"
	};
	// Additional identifier-continue code points. The original note lists the general categories
	// Mn (Mark, Non-Spacing), Mc (Mark, Spacing Combining), Nd (Number, Decimal) and
	// Pc (Punctuation, Connector), and a script named unicat.pl (Unicode 4.0).
	// Searched with std::binary_search, so the included list must be sorted ascending.
	const Manah::Text::CodePoint	_arrIDContinue[] = {
#include "script\Lexer_IdentifierContinue_4_0"
	};
}


// CLexer class implementation
/////////////////////////////////////////////////////////////////////////////

///	Default list of opening brackets; the constructor passes it to SetBrackets.
const char_t	CLexer::m_wszDefaultOpeners[] = L"([{";
///	List of Unicode opening brackets (the openers of the bracketPairs table, without '<').
const char_t	CLexer::m_wszUnicodeOpeners[] = L"([{\x0F3A\xF3C\x169B\x2045\x207D\x208D\x2329\x23B4"
												L"\x2768\x276A\x276C\x276E\x2770\x2772\x2774\x27E6"
												L"\x27E8\x27EA\x2983\x2985\x2987\x2989\x298B\x298D"
												L"\x298F\x2991\x2993\x2995\x2997\x29D8\x29DA\x29FC"
												L"\x3008\x300A\x300C\x300E\x3010\x3014\x3016\x3018"
												L"\x301A\x301D\xFD3E\xFE35\xFE37\xFE39\xFE3B\xFE3D"
												L"\xFE3F\xFE41\xFE43\xFE45\xFE47\xFE59\xFE5B\xFE5D"
												L"\xFF08\xFF3B\xFF5B\xFF5F\xFF62";
///	Next cookie value handed out by the AddXxx methods; shared by all lexers and
///	starts just above NullCookie.
TokenCookie		CLexer::m_nCookie = NullCookie + 1;

/**
 *	Constructor. The new lexer is case sensitive, has Unicode alphabets and
 *	Unicode white spaces enabled, enables every token type and uses the
 *	default bracket set.
 *	@param pEventListener	listener to be notified about added/removed tokens
 *							(may be null; it is null-checked before every use)
 */
CLexer::CLexer(ILexerEventListener* pEventListener)
	:	m_bFreezed(false),
		m_bCaseSensitive(true),
		m_bEnableUnicodeAlphabets(true),
		m_bEnableUnicodeWhiteSpaces(true),
		m_numberFormat(NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL),
		m_pwszBrackets(0),
		m_pEventListener(pEventListener)
{
	// every token type starts out enabled
	fill_n(m_enabledTokenTypes, _countof(m_enabledTokenTypes), true);
	// allocates m_pwszBrackets (null until here)
	SetBrackets(m_wszDefaultOpeners);
}

///	Destructor. Releases the bracket table allocated by SetBrackets.
CLexer::~CLexer() {
	delete[] m_pwszBrackets;
}

/**
 *	Registers a group of keywords under a freshly issued cookie.
 *	The event listener (if any) is told about the new TT_KEYWORD entry.
 *	@param keywords	the keywords to register as one group
 *	@return			the cookie identifying the new keyword group
 */
TokenCookie CLexer::AddKeywords(const set<string_t>& keywords) {
	AssertValid();

	// Publish this lexer's case mode before the keyword set is built
	// (presumably the KeywordSet comparer consults this global flag).
	Private::TKeywordComparer::bCaseSensitive = m_bCaseSensitive;
	m_keywords.insert(make_pair(CLexer::m_nCookie, KeywordSet(keywords.begin(), keywords.end())));

	if(0 != m_pEventListener)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_KEYWORD, CLexer::m_nCookie);
	NotifyChange();

	// hand out the current cookie and advance the shared counter
	const TokenCookie	nIssued = CLexer::m_nCookie++;
	return nIssued;
}

/**
 *	Registers a multi-line annotation (one that may span several lines)
 *	under a freshly issued cookie.
 *	@param strStartDelimiter	text that opens the annotation
 *	@param strEndDelimiter		text that closes the annotation
 *	@param ar					restriction on where the start delimiter may appear
 *	@return						the cookie identifying the new annotation
 */
TokenCookie CLexer::AddMultilineAnnotation(const string_t& strStartDelimiter,
		const string_t& strEndDelimiter, AnnotationRestriction ar /* = AR_NONE */) {
	AssertValid();

	const TMultilineAnnotation	definition = {strStartDelimiter, strEndDelimiter, ar};
	m_multilineAnnotations.insert(make_pair(CLexer::m_nCookie, definition));

	if(0 != m_pEventListener)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	NotifyChange();

	// hand out the current cookie and advance the shared counter
	const TokenCookie	nIssued = CLexer::m_nCookie++;
	return nIssued;
}

/**
 *	Registers a single-line annotation that runs to the end of the line
 *	(IsSinglelineAnnotation reports the whole remaining text for it).
 *	@param strStartDelimiter	text that opens the annotation
 *	@param ar					restriction on where the start delimiter may appear
 *	@return						the cookie identifying the new annotation
 */
TokenCookie CLexer::AddSinglelineAnnotation(
		const string_t& strStartDelimiter, AnnotationRestriction ar /* = AR_NONE */) {
	AssertValid();

	const TSinglelineAnnotationEndedByBreak	definition = {strStartDelimiter, ar};
	m_singlelineAnnotationBs.insert(make_pair(CLexer::m_nCookie, definition));

	if(0 != m_pEventListener)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	NotifyChange();

	// hand out the current cookie and advance the shared counter
	const TokenCookie	nIssued = CLexer::m_nCookie++;
	return nIssued;
}

/**
 *	Registers a single-line annotation that is closed by an explicit end
 *	delimiter (or by the end of the text if the delimiter does not appear).
 *	@param strStartDelimiter	text that opens the annotation
 *	@param strEndDelimiter		text that closes the annotation
 *	@param ar					restriction on where the start delimiter may appear
 *	@return						the cookie identifying the new annotation
 */
TokenCookie CLexer::AddSinglelineAnnotation(const string_t& strStartDelimiter,
		const string_t& strEndDelimiter, AnnotationRestriction ar /* = AR_NONE */) {
	AssertValid();

	const TSinglelineAnnotationEndedByDelimiter	definition = {strStartDelimiter, strEndDelimiter, ar};
	m_singlelineAnnotationDs.insert(make_pair(CLexer::m_nCookie, definition));

	if(0 != m_pEventListener)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	NotifyChange();

	// hand out the current cookie and advance the shared counter
	const TokenCookie	nIssued = CLexer::m_nCookie++;
	return nIssued;
}

/**
 *	Looks up a character in the active bracket table.
 *	@param chBracket	the character to look up
 *	@param chPair		[out] the matching bracket (closer for an opener, opener for a closer);
 *						untouched when the method returns false
 *	@param bOpener		[out] true if <var>chBracket</var> is an opening bracket;
 *						untouched when the method returns false
 *	@return				true if <var>chBracket</var> is one of the active brackets
 */
bool CLexer::GetBracketTraits(char_t chBracket, char_t& chPair, bool& bOpener) const {
	// wcschr() treats the terminating null as part of the string, so a null
	// query would "match" the terminator and the pair lookup below would read
	// one element past the end of the table.
	if(chBracket == 0)
		return false;

	const char_t* const	pwszFound = wcschr(m_pwszBrackets, chBracket);
	if(pwszFound == 0)
		return false;

	// SetBrackets stores opener/closer pairs back to back:
	// even offsets are openers, odd offsets are closers.
	const bool	bFoundOpener = ((pwszFound - m_pwszBrackets) % 2 == 0);
	chPair = bFoundOpener ? pwszFound[1] : pwszFound[-1];
	bOpener = bFoundOpener;
	return true;
}

/**
 *	L[[h̑啶ʂ邩ǂ̐ݒB
 *	ݒύXƓo^ĂL[[h͑Sč폜
 *	@param bIgnore	ʂȂꍇ true
 */
void CLexer::IgnoreCase(bool bIgnore) {
	AssertValid();
	if(bIgnore == m_bCaseSensitive) {
		m_bCaseSensitive = !bIgnore;
		m_keywords.clear();
		NotifyChange();
	}
}

/**
 *	Tests whether a code point may continue an identifier.
 *	Every identifier-start code point qualifies. Beyond that, the Unicode
 *	identifier-continue table is consulted when Unicode alphabets are
 *	enabled; otherwise only '_' and the ASCII digits are accepted.
 *	@param cp	the code point to test
 *	@return		true if the code point may continue an identifier
 *	@see		CLexer::IsIdentifierStartCodePoint
 */
bool CLexer::IsIdentifierContinueCodePoint(CodePoint cp) const {
	AssertValid();

	if(IsIdentifierStartCodePoint(cp))
		return true;

	if(!m_bEnableUnicodeAlphabets)
		return (cp == L'_') || (cp >= L'0' && cp <= L'9');

	const Manah::Text::CodePoint* const	pTableEnd = _arrIDContinue + _countof(_arrIDContinue);
	return binary_search(_arrIDContinue, pTableEnd, cp);
}

/**
 *	Tests whether a code point may start an identifier.
 *	Code points registered through SetAdditionalAlphabets always qualify.
 *	Beyond that, the Unicode identifier-start table plus a few hard-coded
 *	ranges are consulted when Unicode alphabets are enabled; otherwise only
 *	the ASCII letters are accepted.
 *	@param cp	the code point to test
 *	@return		true if the code point may start an identifier
 *	@see		CLexer::IsIdentifierContinueCodePoint
 */
bool CLexer::IsIdentifierStartCodePoint(CodePoint cp) const {
	AssertValid();

	// m_additionalAlphabets is assigned from a std::set (see SetAdditionalAlphabets).
	// Use the container's own lookup: std::binary_search would have to step the
	// set's bidirectional iterators node by node.
	if(m_additionalAlphabets.find(cp) != m_additionalAlphabets.end())
		return true;
	if(m_bEnableUnicodeAlphabets)
		// large contiguous blocks are tested as ranges (presumably they are left
		// out of the generated table to keep it small -- confirm against the script)
		return binary_search(_arrIDStart, _arrIDStart + _countof(_arrIDStart), cp)
			|| (cp >= 0x1401 && cp <= 0x166C)		// Canadian Syllabics
			|| (cp >= 0x3400 && cp <= 0x4DB5)		// CJK Unified Ideograph
			|| (cp >= 0x4E00 && cp <= 0x9FA5)		// CJK Unified Ideograph
			|| (cp >= 0xA000 && cp <= 0xA48C)		// Yi Syllable
			|| (cp >= 0xAC00 && cp <= 0xD7A3)		// Hangul Syllable
			|| (cp >= 0x20000 && cp <= 0x2A6D6)		// CJK Unified Ideograph
			|| (cp >= 0x2F800 && cp <= 0x2FA1D);	// CJK Compatibility Ideograph
	else
		return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');
}

/**
 *	L[[h̒Ԃ
 *	@param str		ׂ镶
 *	@param nCookie	[out] L[[hɊ֘AtꂽNbL[l
 *	@return			L[[hłꍇ true
 */
bool CLexer::IsKeyword(const string_t& str, TokenCookie& nCookie) const {
	AssertValid();
	assert(!str.empty());

	Private::TKeywordComparer::bCaseSensitive = m_bCaseSensitive;
	for(KeywordsMap::const_iterator it = m_keywords.begin(); it != m_keywords.end(); ++it) {
		KeywordSet::const_iterator	itKeyword = it->second.find(str);
		if(itKeyword != it->second.end()) {
			nCookie = it->first;
			return true;
		}
	}
	return false;
}

/**
 *	Returns the length of the multi-line annotation at the head of the text
 *	(the start and end delimiters are included).
 *	@param pwsz			the text to examine
 *	@param cch			length of <var>pwsz</var>
 *	@param ar			restrictions satisfied by the position being examined
 *	@param nCookie		[in, out] On input, the cookie of a multi-line annotation that is
 *						already open before this text, or NullCookie to look for a start
 *						delimiter here. When a start delimiter is found, the cookie of that
 *						annotation is stored; otherwise the value is left unchanged
 *	@param bContinued	[out] true if the annotation is not closed within the text
 *	@return				length of the annotation, or 0 if no annotation starts here
 *	@exception invalid_argument	<var>nCookie</var> is not a registered multi-line annotation
 */
length_t CLexer::IsMultilineAnnotation(const char_t* pwsz, length_t cch,
		AnnotationRestriction ar, TokenCookie& nCookie, bool& bContinued) const throw(invalid_argument) {
	AssertValid();
	assert(pwsz != 0);

	length_t	cchStart = 0;	// length of the start delimiter matched here (0 when continuing)

	bContinued = false;
	if(nCookie == NullCookie) {
		// not inside an annotation yet: look for a start delimiter at the head of the text
		MAnnotationMap::const_iterator	itStart = m_multilineAnnotations.begin();
		for(; itStart != m_multilineAnnotations.end(); ++itStart) {
			const string_t&	strStart = itStart->second.strStartDelimiter;
			const bool		bAllowedHere = ((itStart->second.ar & ar) == itStart->second.ar);
			if(bAllowedHere
					&& strStart.length() <= cch
					&& wcsncmp(pwsz, strStart.data(), strStart.length()) == 0) {
				nCookie = itStart->first;
				cchStart = strStart.length();
				break;
			}
		}
		if(nCookie == NullCookie)
			return 0;
	}
	bContinued = true;

	// look for the end delimiter of the annotation identified by nCookie
	const MAnnotationMap::const_iterator	itAnnotation = m_multilineAnnotations.find(nCookie);
	if(itAnnotation == m_multilineAnnotations.end())
		throw invalid_argument("Input cookie value is invalid.");

	// std::search on the raw range avoids building a temporary string
	const string_t&		strEnd = itAnnotation->second.strEndDelimiter;
	const char_t* const	pwszLimit = pwsz + cch;
	const char_t* const	pwszFound = search(pwsz + cchStart, pwszLimit, strEnd.begin(), strEnd.end());
	if(pwszFound != pwszLimit) {
		bContinued = false;
		return pwszFound - pwsz + strEnd.length();
	}
	return cch;	// not closed here: the whole text belongs to the annotation
}

/**
 *	Returns the length of the number at the head of the text.
 *	Only the NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL format is implemented: a digit
 *	followed by any run of '.' and identifier-continue characters.
 *	Every other number format yields 0.
 *	@param pwsz	the text to examine
 *	@param cch	length of <var>pwsz</var>
 *	@return		length of the number, or 0 if the text does not start with a number
 */
length_t CLexer::IsNumber(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	// a number always begins with a digit
	if(cch == 0 || !toBoolean(iswdigit(pwsz[0])))
		return 0;
	// the language-specific formats have no implementation
	if(m_numberFormat != NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL)
		return 0;

	length_t	pos = 1;
	while(pos < cch) {
		CodePoint	cp;
		if(pos < cch - 1
				&& IsUTF16HighSurrogate(pwsz[pos])
				&& IsUTF16LowSurrogate(pwsz[pos + 1]))
			cp = DecodeUTF16SurrogatePairToCodePoint(pwsz + pos, cch - pos);
		else
			cp = pwsz[pos];

		if(pwsz[pos] != L'.' && !IsIdentifierContinueCodePoint(cp))
			return pos;
		// a supplementary code point occupies two UTF-16 code units
		pos += (cp > 0xFFFF) ? 2 : 1;
	}
	return cch;
}

/**
 *	Returns the length of the operator at the head of the text.
 *	Operators are bucketed by their first character; the first operator of
 *	the bucket (in the set's iteration order) that prefixes the text wins.
 *	NOTE(review): SetOperators folds the bucket key to lower case when the
 *	lexer is case-insensitive, while this lookup uses the raw character and a
 *	case-sensitive comparison -- confirm whether that is intended.
 *	@param pwsz	the text to examine
 *	@param cch	length of <var>pwsz</var>
 *	@return		length of the operator, or 0 if the text does not start with an operator
 */
length_t CLexer::IsOperator(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	if(cch == 0)
		return 0;

	const OperatorMap::const_iterator	itBucket = m_operators.find(pwsz[0]);
	if(itBucket == m_operators.end())
		return 0;

	const OperatorSet&	candidates = itBucket->second;
	OperatorSet::const_iterator	itCandidate = candidates.begin();
	while(itCandidate != candidates.end()) {
		const string_t::size_type	cchCandidate = itCandidate->length();
		if(cchCandidate <= cch && wcsncmp(pwsz, itCandidate->data(), cchCandidate) == 0)
			return cchCandidate;
		++itCandidate;
	}
	return 0;
}

/**
 *	Returns the length of the single-line annotation at the head of the text.
 *	Annotations ended by a line break are tried first and cover the whole
 *	text; annotations ended by a delimiter cover the text up to and including
 *	the end delimiter, or the whole text if the delimiter does not appear.
 *	@param pwsz		the text to examine
 *	@param cch		length of <var>pwsz</var>
 *	@param ar		restrictions satisfied by the position being examined
 *	@param nCookie	[out] cookie of the annotation found; untouched when 0 is returned
 *	@return			length of the annotation, or 0 if no annotation starts here
 */
length_t CLexer::IsSinglelineAnnotation(const char_t* pwsz,
		length_t cch, AnnotationRestriction ar, TokenCookie& nCookie) const {
	AssertValid();
	assert(pwsz != 0);

	// annotations that run to the end of the line
	for(SAnnotationBMap::const_iterator it =
			m_singlelineAnnotationBs.begin(); it != m_singlelineAnnotationBs.end(); ++it) {
		if(((it->second.ar & ar) != it->second.ar)
				|| it->second.strStartDelimiter.length() > cch)
			continue;
		else if(wcsncmp(pwsz, it->second.strStartDelimiter.data(),
				it->second.strStartDelimiter.length()) == 0) {
			nCookie = it->first;
			return cch;
		}
	}

	// annotations that are closed by an explicit delimiter
	for(SAnnotationDMap::const_iterator itD =
			m_singlelineAnnotationDs.begin(); itD != m_singlelineAnnotationDs.end(); ++itD) {
		if(((itD->second.ar & ar) != itD->second.ar)
				|| itD->second.strStartDelimiter.length() > cch)
			continue;
		else if(wcsncmp(pwsz,
				itD->second.strStartDelimiter.data(),
				itD->second.strStartDelimiter.length()) == 0) {
			nCookie = itD->first;
			// Search only inside [pwsz, pwsz + cch), as IsMultilineAnnotation does.
			// wcsstr() would ignore cch and depend on the buffer being null-terminated.
			const string_t&		strEnd = itD->second.strEndDelimiter;
			const char_t* const	pwszLimit = pwsz + cch;
			const char_t* const	pwszEnd = search(
				pwsz + itD->second.strStartDelimiter.length(), pwszLimit, strEnd.begin(), strEnd.end());
			return (pwszEnd != pwszLimit) ? pwszEnd - pwsz + strEnd.length() : cch;
		}
	}
	return 0;
}

/**
 *	Splits a string into tokens.
 *	@param str		the text to analyze
 *	@param nCookie	[in, out] On input, the cookie of a multi-line annotation that is still
 *					open before this text (NullCookie if none). On output, the cookie of the
 *					multi-line annotation left open at the end of the text, or NullCookie
 *	@param tokens	[out] the tokens found are appended to this list
 */
void CLexer::Parse(const string_t& str, TokenCookie& nCookie, TokenList& tokens) const {
	AssertValid();

	CToken			token;
	const char_t*	pwsz = str.c_str();
	const length_t	cch = str.length();
	bool			bMCommentContinued;

	token.m_i = 0;
	if(nCookie != NullCookie) {	// a multi-line annotation is still open from before this text
		token.SetCookie(nCookie);
		const length_t	cchToken = IsMultilineAnnotation(pwsz, cch,
			AR_ONLYSTARTOFLINE | AR_ONLYHEADOFLINE, nCookie, bMCommentContinued);
		token.SetType(TT_ANNOTATION);
		tokens.push_back(token);
		token.m_i = cchToken;
		if(!bMCommentContinued)
			nCookie = NullCookie;
	}
	token.SetCookie(NullCookie);

	length_t				i = token.m_i;
	length_t				cchToken;
	TokenCookie				nTokenCookie = NullCookie;
	AnnotationRestriction	ar;
	bool					bAppearedUnspace = false;
	while(i < cch) {
		// work out which annotation restrictions the current position satisfies
		ar = AR_NONE;
		if(i == 0)				ar |= AR_ONLYSTARTOFLINE;
		if(!bAppearedUnspace)	ar |= AR_ONLYHEADOFLINE;

		if(m_enabledTokenTypes[TT_TAB] && pwsz[i] == L'\t') {	// tab
			cchToken = 1;
			token.SetType(TT_TAB);
		} else if(m_enabledTokenTypes[TT_WHITESPACE]	// white space
				&& 0 != (cchToken = IsWhiteSpace(pwsz + i, cch - i, false)))
			token.SetType(TT_WHITESPACE);
		else if(m_enabledTokenTypes[TT_NUMBER]	// number
				&& 0 != (cchToken = IsNumber(pwsz + i, cch - i)))
			token.SetType(TT_NUMBER);
		else if(m_enabledTokenTypes[TT_SINGLEQUOTATION]	// single quotation
				&& pwsz[i] == L'\''
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\'')))
			token.SetType(TT_SINGLEQUOTATION);
		else if(m_enabledTokenTypes[TT_DOUBLEQUOTATION]	// double quotation
				&& pwsz[i] == L'\"'
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\"')))
			token.SetType(TT_DOUBLEQUOTATION);
		else if(m_enabledTokenTypes[TT_OTHERQUOTATION]	// other quotation
				&& pwsz[i] != L'\'' && pwsz[i] != L'\"'
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, pwsz[i])))
			token.SetType(TT_OTHERQUOTATION);
		else if(m_enabledTokenTypes[TT_ANNOTATION]	// single-line annotation
				&& 0 != (cchToken = IsSinglelineAnnotation(pwsz + i, cch - i, ar, nTokenCookie))) {
			token.SetCookie(nTokenCookie);
			token.SetType(TT_ANNOTATION);
		} else if(m_enabledTokenTypes[TT_ANNOTATION]	// multi-line annotation
				&& 0 != (cchToken = IsMultilineAnnotation(pwsz + i,
				cch - i, ar, nTokenCookie, bMCommentContinued))) {
			token.SetCookie(nTokenCookie);
			token.SetType(TT_ANNOTATION);
			if(bMCommentContinued)	// continues after the end of this text
				nCookie = nTokenCookie;
		} else if(m_enabledTokenTypes[TT_OPERATOR]	// operator
				&& 0 != (cchToken = IsOperator(pwsz + i, cch - i)))
			token.SetType(TT_OPERATOR);
		else if(m_enabledTokenTypes[TT_NUMERAL]	// numerals
				&& 0 != (cchToken = IsNumerals(pwsz + i, cch - i)))
			token.SetType(TT_NUMERAL);
		else if(m_enabledTokenTypes[TT_ASCII_CONTROL]	// ASCII control character
				&& 0 != (cchToken = IsAsciiControl(pwsz + i, cch - i)))
			token.SetType(TT_ASCII_CONTROL);
		else if(m_enabledTokenTypes[TT_UNICODE_CONTROL]	// Unicode control character
				&& IsUnicodeControl(pwsz + i, cch - i)) {
			cchToken = 1;
			token.SetType(TT_UNICODE_CONTROL);
		} else if(m_enabledTokenTypes[TT_IDENTIFIER]	// identifier or keyword
				&& 0 != (cchToken = IsIdentifier(pwsz + i, cch - i))) {
			if(m_keywords.empty())
				token.SetType(TT_IDENTIFIER);
			else if(IsKeyword(string_t(pwsz + i, cchToken), nTokenCookie)) {
				token.SetCookie(nTokenCookie);
				token.SetType(TT_KEYWORD);
			} else
				token.SetType(TT_IDENTIFIER);
		} else {	// anything else: consume exactly one character cluster
			// Take the code point at i, then extend over the following code points
			// until one that starts a new cluster. Each code point is decoded at its
			// own offset, so a code point that does not start a cluster can no longer
			// make the token swallow the rest of the text.
			CodePoint	cp = DecodeUTF16SurrogatePairToCodePoint(pwsz + i, cch - i);
			cchToken = (cp > 0xFFFF) ? 2 : 1;
			while(i + cchToken < cch) {
				cp = DecodeUTF16SurrogatePairToCodePoint(pwsz + i + cchToken, cch - i - cchToken);
				if(CBoundarySearcher::IsFirstCharacterOfCluster(cp))
					break;
				cchToken += (cp > 0xFFFF) ? 2 : 1;
			}
			token.SetType(TT_UNSPECIFIED);
		}

		if(tokens.empty() || token.GetType() != TT_UNSPECIFIED || token.GetType() != tokens.back().GetType()
				|| (!tokens.empty() && i - tokens.back().GetIndex() == 1	// a bracket stays a token of its own
				&& wcschr(m_pwszBrackets, pwsz[tokens.back().GetIndex()]) != 0)) {
			token.m_i = i;
			tokens.push_back(token);
			nTokenCookie = NullCookie;
			token.SetCookie(NullCookie);
		} else
			/* consecutive unspecified tokens are merged into the previous one */;
		i += cchToken;

		if(token.GetType() != TT_WHITESPACE && token.GetType() != TT_TAB)
			bAppearedUnspace = true;
	}
}

/**
 *	Determines whether a multi-line annotation is still open at the end of a string,
 *	without producing tokens.
 *	NOTE(review): the restriction argument passed to the IsXxxAnnotation calls is a
 *	bool (true / i == 0) rather than the AR_xxx flags used by Parse -- confirm that
 *	the numeric values agree.
 *	@param str		the text to analyze
 *	@param nCookie	cookie of a multi-line annotation that is still open before this text,
 *					or NullCookie if there is none
 *	@return			cookie of the multi-line annotation left open at the end of the text,
 *					or NullCookie if none is open
 */
TokenCookie CLexer::ParseMultilineAnnotation(const string_t& str, TokenCookie nCookie) const {
	AssertValid();

	// an empty text cannot close (or open) anything
	if(str.empty())
		return nCookie;

	const char_t*	pwsz = str.data();
	const length_t	cch = str.length();
	length_t		i = 0;
	length_t		cchToken;
	TokenCookie		nDummy;
	bool			bMCommentContinued;

	// a multi-line annotation is still open from before this text
	if(nCookie != NullCookie) {
		i = IsMultilineAnnotation(pwsz, cch, true, nCookie, bMCommentContinued);
		if(bMCommentContinued)
			return nCookie;
		nCookie = NullCookie;
	}

	while(i < cch) {
		if(m_enabledTokenTypes[TT_ANNOTATION]
				&& 0 != (cchToken = IsSinglelineAnnotation(pwsz + i, cch - i, i == 0, nDummy))) {
			// Skip the annotation and keep scanning. One ended by a line break covers
			// the rest of the text and ends the loop; one ended by a delimiter may be
			// followed by a multi-line annotation that must still be detected.
			i += cchToken;
		} else if(m_enabledTokenTypes[TT_ANNOTATION]
				&& 0 != (cchToken = IsMultilineAnnotation(pwsz + i,
					cch - i, i == 0, nCookie, bMCommentContinued))) {
			if(bMCommentContinued)
				return nCookie;
			nCookie = NullCookie;
			i += cchToken;
		} else if(m_enabledTokenTypes[TT_SINGLEQUOTATION]
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\'')))
			i += cchToken;
		else if(m_enabledTokenTypes[TT_DOUBLEQUOTATION]
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\"')))
			i += cchToken;
		else if(m_enabledTokenTypes[TT_OTHERQUOTATION]
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, pwsz[i])))
			i += cchToken;
		else
			++i;
	}
	return NullCookie;
}

///	Removes every registered keyword, annotation and operator.
void CLexer::RemoveAll() {
	AssertValid();

	// operators
	m_operators.clear();
	// annotations
	m_singlelineAnnotationDs.clear();
	m_singlelineAnnotationBs.clear();
	m_multilineAnnotations.clear();
	// keywords
	m_keywords.clear();

	NotifyChange();
}

/**
 *	Removes a token definition registered by one of the AddXxx methods.
 *	The event listener (if any) is told which kind of token was removed.
 *	@param nCookie	cookie returned by the AddXxx method
 *	@exception invalid_argument	<var>nCookie</var> does not identify a registered definition
 */
void CLexer::RemoveIdentifiedToken(TokenCookie nCookie) throw(invalid_argument) {
	AssertValid();

	// TT_COUNT serves as the "not found" marker.
	// The containers are tried in turn; the first one holding the cookie wins.
	TokenType	type = TT_COUNT;
	if(m_keywords.erase(nCookie) != 0)
		type = TT_KEYWORD;
	else if(m_multilineAnnotations.erase(nCookie) != 0
			|| m_singlelineAnnotationBs.erase(nCookie) != 0
			|| m_singlelineAnnotationDs.erase(nCookie) != 0)
		type = TT_ANNOTATION;

	if(type == TT_COUNT)
		throw invalid_argument("Specified cookie value is invalid.");

	if(0 != m_pEventListener)
		m_pEventListener->OnLexerRemovedIdentifiedToken(type, nCookie);
	NotifyChange();
}

///	SĂ̐ݒ荀ڂԂɖ߂
void CLexer::Reset() {
	AssertValid();

	m_bCaseSensitive = true;
	m_bEnableUnicodeAlphabets = true;
	m_bEnableUnicodeWhiteSpaces = true;
	m_numberFormat = NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL;
	fill(m_enabledTokenTypes, m_enabledTokenTypes + _countof(m_enabledTokenTypes), true);
	RemoveAll();
}

/**
 *	̃At@xbgȊOɁAP\ƂĂ݂ȂR[h|Cgݒ肷
 *	@param pwszAlphabets	ׂBLȃTQ[gyA͔ BMP ɕϊ
 *	@param cch				<var>pwszAlphabets</var> ̕
 */
void CLexer::SetAdditionalAlphabets(const char_t* pwszAlphabets, length_t cch) {
	AssertValid();

	m_additionalAlphabets.clear();
	for(length_t i = 0; i < cch; ++i) {
		const CodePoint	cp = DecodeUTF16SurrogatePairToCodePoint(pwszAlphabets + i, cch - i);
		m_additionalAlphabets.insert(cp);
		if(cp > 0xFFFF)
			++i;
	}
	NotifyChange();
}

/**
 *	Replaces the set of additional code points that are treated as
 *	identifier-start characters (see IsIdentifierStartCodePoint).
 *	@param alphabets	the set of code points; it is copied
 */
void CLexer::SetAdditionalAlphabets(const set<CodePoint>& alphabets) {
	AssertValid();
	m_additionalAlphabets = alphabets;
	NotifyChange();
}

/**
 *	ʂƂĎgp镶ݒ肷
 *	@param pwszBrackets				Jʂׂ
 *	@throw std::invalid_argument	JʂƂĎgpłȂɊ܂܂ĂƂX[
 */
void CLexer::SetBrackets(const char_t* pwszBrackets) {
	AssertValid();
	assert(pwszBrackets != 0);

	ostringstream_t	ss;

	for(size_t i = 0; pwszBrackets[i] != 0; ++i) {
		for(size_t j = 0; ; ++j) {
			if(bracketPairs[j].first == 0 || bracketPairs[j].second == pwszBrackets[i])
				throw invalid_argument("Specified character can not be used as an opener.");
			else if(bracketPairs[j].first == pwszBrackets[i]) {
				ss << bracketPairs[j].first << bracketPairs[j].second;
				break;
			}
		}
	}
	const string_t	str = ss.str();
	delete[] m_pwszBrackets;
	m_pwszBrackets = new char_t[str.length() + 1];
	wcscpy(m_pwszBrackets, str.c_str());
}

/**
 *	Zqݒ肷
 *	@param operators	Zq̏W
 */
void CLexer::SetOperators(const set<string_t>& operators) {
	AssertValid();

	Private::TKeywordComparer::bCaseSensitive = m_bCaseSensitive;
	m_operators.clear();
	for(set<string_t>::const_iterator it = operators.begin(); it != operators.end(); ++it) {
		if(it->empty())
			continue;
		const char_t	chKey = m_bCaseSensitive ? it->at(0) : LOWORD(::CharLowerW(reinterpret_cast<char_t*>(it->at(0))));
		m_operators[chKey].insert(*it);
	}
	NotifyChange();
}

/* [EOF] */