/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Novell code.
 *
 * The Initial Developer of the Original Code is Novell Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2006
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   robert@ocallahan.org
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nsTextFrameUtils.h"

#include "nsContentUtils.h"
#include "nsIWordBreaker.h"
#include "gfxFont.h"
#include "nsTextTransformer.h"
#include "nsCompressedCharMap.h"
#include "nsUnicharUtils.h"

// XXX TODO implement transform of backslash to yen that nsTextTransform does
// when requested by PresContext->LanguageSpecificTransformType(). Do it with
// a new factory type that just munges the input stream. But first, check
// that we really still need this, it's only enabled via a hidden pref
// which defaults false...

// Replaced by precompiled CCMap (see bug 180266). To update the list
// of characters, see one of files included below. As for the way
// the original list of characters was obtained by Frank Tang, see bug 54467.
// Updated to fix the regression (bug 263411). The list contains
// characters of the following Unicode character classes : Ps, Pi, Po, Pf, Pe.
// (ref.: http://www.w3.org/TR/2004/CR-CSS21-20040225/selector.html#first-letter)
// Note that the file does NOT yet include non-BMP characters because
// there's no point including them without fixing the way we identify
// 'first-letter' currently working only with BMP characters.
#include "punct_marks.ccmap"
DEFINE_CCMAP(gPuncCharsCCMap, const);

/**
 * Returns whether aChar is a member of the precompiled punctuation
 * character map (gPuncCharsCCMap) included above.
 */
PRBool nsTextFrameUtils::IsPunctuationMark(PRUnichar aChar)
{
  return CCMAP_HAS_CHAR(gPuncCharsCCMap, aChar);
}

/**
 * Decides whether a 16-bit character is dropped from the transformed text.
 * A soft hyphen (CH_SHY) is discardable and additionally sets TEXT_HAS_SHY
 * in *aFlags; otherwise only characters in the U+20xx block that satisfy
 * IS_BIDI_CONTROL are discardable.
 */
static PRBool IsDiscardable(PRUnichar ch, PRUint32* aFlags)
{
  // Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by gfxTextRun
  // and discarding it would force us to copy text in many cases of preformatted
  // text containing \r\n.
  if (ch == CH_SHY) {
    *aFlags |= nsTextFrameUtils::TEXT_HAS_SHY;
    return PR_TRUE;
  }
  if ((ch & 0xFF00) != 0x2000) {
    // Not a Bidi control character
    return PR_FALSE;
  }
  return IS_BIDI_CONTROL(ch);
}

/**
 * 8-bit overload: only the soft hyphen (CH_SHY) is discardable, and it sets
 * TEXT_HAS_SHY in *aFlags. The U+20xx check of the 16-bit overload cannot
 * match an 8-bit value, so it is omitted here.
 */
static PRBool IsDiscardable(PRUint8 ch, PRUint32* aFlags)
{
  if (ch == CH_SHY) {
    *aFlags |= nsTextFrameUtils::TEXT_HAS_SHY;
    return PR_TRUE;
  }
  return PR_FALSE;
}

/**
 * Copies aLength 16-bit characters from aText to aOutput, transforming them
 * on the way, and records one KeepChar()/SkipChar() call on aSkipChars per
 * input character.
 *
 * Without whitespace compression: tabs become spaces (TEXT_HAS_TAB is set),
 * discardable characters are skipped and CH_NBSP becomes a plain space.
 *
 * With whitespace compression: every run of space/tab/newline collapses to a
 * single space (or to nothing when *aIncomingWhitespace says the preceding
 * text already ended in whitespace), a space that starts a combining sequence
 * is kept as an ordinary character, and a newline directly between two CJ
 * characters is dropped.
 *
 * @param aText               input characters
 * @param aLength             number of input characters
 * @param aOutput             destination buffer; at most aLength characters
 *                            are written
 * @param aCompressWhitespace selects the compressing behaviour described above
 * @param aIncomingWhitespace in: whether we are already inside whitespace;
 *                            out: updated to whether the last character
 *                            written is a space (left untouched when nothing
 *                            was written)
 * @param aSkipChars          receives the keep/skip decision per input char
 * @param aAnalysisFlags      out: accumulated TEXT_* flags
 * @return pointer one past the last character written to aOutput
 */
PRUnichar*
nsTextFrameUtils::TransformText(const PRUnichar* aText, PRUint32 aLength,
                                PRUnichar* aOutput,
                                PRBool aCompressWhitespace,
                                PRPackedBool* aIncomingWhitespace,
                                gfxSkipCharsBuilder* aSkipChars,
                                PRUint32* aAnalysisFlags)
{
  // We're just going to assume this!
  PRUint32 flags = TEXT_HAS_NON_ASCII;
  PRUnichar* outputStart = aOutput;

  if (!aCompressWhitespace) {
    // Convert tabs to spaces and skip discardables.
    PRUint32 i;
    for (i = 0; i < aLength; ++i) {
      PRUnichar ch = *aText++;
      if (ch == '\t') {
        flags |= TEXT_HAS_TAB|TEXT_WAS_TRANSFORMED;
        aSkipChars->KeepChar();
        *aOutput++ = ' ';
      } else if (IsDiscardable(ch, &flags)) {
        aSkipChars->SkipChar();
      } else {
        aSkipChars->KeepChar();
        if (ch == CH_NBSP) {
          ch = ' ';
          flags |= TEXT_WAS_TRANSFORMED;
        } else if (IS_SURROGATE(ch)) {
          flags |= gfxTextRunFactory::TEXT_HAS_SURROGATES;
        }
        *aOutput++ = ch;
      }
    }
  } else {
    PRBool inWhitespace = *aIncomingWhitespace;
    PRUint32 i;
    for (i = 0; i < aLength; ++i) {
      PRUnichar ch = *aText++;
      // aText has already been advanced past ch: aText[0] is the character
      // that FOLLOWS ch (valid only when i + 1 < aLength) and aText[-2] is
      // the one that PRECEDES it (valid only when i > 0). The lookahead and
      // lookbehind below must be expressed relative to that position;
      // indexing with [1]/[-1] would inspect the wrong characters and could
      // read one element past the end of the buffer.
      PRBool nowInWhitespace;
      if (ch == ' ' &&
          (i + 1 >= aLength ||
           !IsSpaceCombiningSequenceTail(aText, aLength - (i + 1)))) {
        nowInWhitespace = PR_TRUE;
      } else if (ch == '\n') {
        if (i > 0 && IS_CJ_CHAR(aText[-2]) &&
            i + 1 < aLength && IS_CJ_CHAR(aText[0])) {
          // Discard newlines between CJK chars.
          // XXX this really requires more context to get right!
          aSkipChars->SkipChar();
          continue;
        }
        nowInWhitespace = PR_TRUE;
      } else {
        nowInWhitespace = ch == '\t';
      }

      if (!nowInWhitespace) {
        if (IsDiscardable(ch, &flags)) {
          aSkipChars->SkipChar();
          // A discarded character must not interrupt a whitespace run.
          nowInWhitespace = inWhitespace;
        } else {
          if (ch == CH_NBSP) {
            ch = ' ';
            flags |= TEXT_WAS_TRANSFORMED;
          } else if (IS_SURROGATE(ch)) {
            flags |= gfxTextRunFactory::TEXT_HAS_SURROGATES;
          }
          *aOutput++ = ch;
          aSkipChars->KeepChar();
        }
      } else {
        if (inWhitespace) {
          // Second or later whitespace character of a run: collapse it.
          aSkipChars->SkipChar();
        } else {
          if (ch != ' ') {
            flags |= TEXT_WAS_TRANSFORMED;
          }
          *aOutput++ = ' ';
          aSkipChars->KeepChar();
        }
      }
      inWhitespace = nowInWhitespace;
    }
  }

  if (outputStart < aOutput) {
    *aIncomingWhitespace = aOutput[-1] == ' ';
  }
  // Any skipped character means the output differs from the input.
  if (outputStart + aLength != aOutput) {
    flags |= TEXT_WAS_TRANSFORMED;
  }
  *aAnalysisFlags = flags;
  return aOutput;
}

/**
 * 8-bit counterpart of the 16-bit TransformText. The keep/skip bookkeeping
 * and the handling of tabs, discardables and CH_NBSP are the same, but there
 * is no combining-sequence or CJ newline handling: under whitespace
 * compression space, tab and newline are always whitespace.
 * TEXT_HAS_NON_ASCII is set only if some input byte has its high bit set.
 *
 * @param aText               input characters
 * @param aLength             number of input characters
 * @param aOutput             destination buffer; at most aLength characters
 *                            are written
 * @param aCompressWhitespace whether to collapse whitespace runs
 * @param aIncomingWhitespace in: whether we are already inside whitespace;
 *                            out: updated to whether the last character
 *                            written is a space (left untouched when nothing
 *                            was written)
 * @param aSkipChars          receives the keep/skip decision per input char
 * @param aAnalysisFlags      out: accumulated TEXT_* flags
 * @return pointer one past the last character written to aOutput
 */
PRUint8*
nsTextFrameUtils::TransformText(const PRUint8* aText, PRUint32 aLength,
                                PRUint8* aOutput,
                                PRBool aCompressWhitespace,
                                PRPackedBool* aIncomingWhitespace,
                                gfxSkipCharsBuilder* aSkipChars,
                                PRUint32* aAnalysisFlags)
{
  PRUint32 flags = 0;
  // OR of every input byte; bit 7 tells us whether non-ASCII was seen.
  PRUint8 allBits = 0;
  PRUint8* outputStart = aOutput;

  if (!aCompressWhitespace) {
    // Convert tabs to spaces and skip discardables.
    PRUint32 i;
    for (i = 0; i < aLength; ++i) {
      PRUint8 ch = *aText++;
      allBits |= ch;
      if (ch == '\t') {
        flags |= TEXT_HAS_TAB|TEXT_WAS_TRANSFORMED;
        aSkipChars->KeepChar();
        *aOutput++ = ' ';
      } else if (IsDiscardable(ch, &flags)) {
        aSkipChars->SkipChar();
      } else {
        aSkipChars->KeepChar();
        if (ch == CH_NBSP) {
          ch = ' ';
          flags |= TEXT_WAS_TRANSFORMED;
        }
        *aOutput++ = ch;
      }
    }
  } else {
    PRBool inWhitespace = *aIncomingWhitespace;
    PRUint32 i;
    for (i = 0; i < aLength; ++i) {
      PRUint8 ch = *aText++;
      allBits |= ch;
      PRBool nowInWhitespace = ch == ' ' || ch == '\t' || ch == '\n';
      if (!nowInWhitespace) {
        if (IsDiscardable(ch, &flags)) {
          aSkipChars->SkipChar();
          // A discarded character must not interrupt a whitespace run.
          nowInWhitespace = inWhitespace;
        } else {
          if (ch == CH_NBSP) {
            ch = ' ';
            flags |= TEXT_WAS_TRANSFORMED;
          }
          *aOutput++ = ch;
          aSkipChars->KeepChar();
        }
      } else {
        if (inWhitespace) {
          // Second or later whitespace character of a run: collapse it.
          aSkipChars->SkipChar();
        } else {
          if (ch != ' ') {
            flags |= TEXT_WAS_TRANSFORMED;
          }
          *aOutput++ = ' ';
          aSkipChars->KeepChar();
        }
      }
      inWhitespace = nowInWhitespace;
    }
  }

  if (outputStart < aOutput) {
    *aIncomingWhitespace = aOutput[-1] == ' ';
  }
  // Any skipped character means the output differs from the input.
  if (outputStart + aLength != aOutput) {
    flags |= TEXT_WAS_TRANSFORMED;
  }
  if (allBits & 0x80) {
    flags |= TEXT_HAS_NON_ASCII;
  }
  *aAnalysisFlags = flags;
  return aOutput;
}

// TODO The wordbreaker needs to be fixed. It's buggy, for example, it doesn't
// handle diacriticals combined with spaces

// Coarse character classes used to find word boundaries in 8-bit text.
enum SimpleCharClass {
  CLASS_ALNUM,
  CLASS_PUNCT,
  CLASS_SPACE
};

// This is what nsSampleWordBreaker::GetClass considers whitespace
//
// aChars[0] is the character being tested; aLength is the number of
// characters available starting at aChars. Tab, newline and carriage return
// are always whitespace; a space is whitespace unless the characters after it
// form a combining sequence tail.
static PRBool IsWordBreakerWhitespace(const PRUnichar* aChars, PRInt32 aLength)
{
  PRUnichar ch = aChars[0];
  if (ch == '\t' || ch == '\n' || ch == '\r')
    return PR_TRUE;
  if (ch == ' ' &&
      !nsTextFrameUtils::IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1))
    return PR_TRUE;
  return PR_FALSE;
}

// like nsSampleWordBreaker::GetClass
//
// Space, tab, newline and carriage return are CLASS_SPACE; ASCII letters,
// ASCII digits and every value >= 128 are CLASS_ALNUM; everything else is
// CLASS_PUNCT.
static SimpleCharClass Classify8BitChar(PRUint8 aChar)
{
  if (aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == '\r')
    return CLASS_SPACE;
  // The upper-case range must be tested with &&: with || the test is true
  // for every value, so nothing could ever be classified as CLASS_PUNCT.
  if ((aChar >= 'a' && aChar <= 'z') ||
      (aChar >= 'A' && aChar <= 'Z') ||
      (aChar >= '0' && aChar <= '9') ||
      (aChar >= 128))
    return CLASS_ALNUM;
  return CLASS_PUNCT;
}

/**
 * Starting from aPosition, searches in direction aDirection for the next
 * word boundary inside [aOffset, aOffset + aLength).
 *
 * For 16-bit text the word breaker from nsContentUtils proposes candidate
 * positions; for 8-bit text a boundary is a change of Classify8BitChar
 * class. In both cases a candidate is only accepted where the text run marks
 * a CLUSTER_START. The range up to that boundary is then scanned for an
 * earlier punctuation/non-punctuation transition, which is used instead when
 * the matching aBreakBeforePunctuation / aBreakAfterPunctuation flag is set.
 *
 * @param aText                   the text being searched
 * @param aTextRun                text run queried for CLUSTER_START flags
 * @param aIterator               maps original offsets to skipped offsets
 * @param aOffset                 start of the permitted range
 * @param aLength                 length of the permitted range
 * @param aPosition               offset the search starts from
 * @param aDirection              step added to the position on each move;
 *                                > 0 searches forward (uses NextWord),
 *                                otherwise backward (uses PrevWord)
 * @param aBreakBeforePunctuation allow a break where punctuation begins
 * @param aBreakAfterPunctuation  allow a break where punctuation ends
 * @param aWordIsWhitespace       out: whether the character at aPosition is
 *                                whitespace for word-breaking purposes
 * @return the boundary offset, or -1 if none lies within the permitted range
 */
PRInt32
nsTextFrameUtils::FindWordBoundary(const nsTextFragment* aText,
                                   gfxTextRun* aTextRun,
                                   gfxSkipCharsIterator* aIterator,
                                   PRInt32 aOffset, PRInt32 aLength,
                                   PRInt32 aPosition, PRInt32 aDirection,
                                   PRBool aBreakBeforePunctuation,
                                   PRBool aBreakAfterPunctuation,
                                   PRBool* aWordIsWhitespace)
{
  // A space followed by combining diacritical marks is not whitespace!!
  PRInt32 textLength = aText->GetLength();
  *aWordIsWhitespace = aText->Is2b()
    ? IsWordBreakerWhitespace(aText->Get2b() + aPosition, textLength - aPosition)
    : Classify8BitChar(aText->Get1b()[aPosition]) == CLASS_SPACE;

  PRInt32 nextWordPos; // first character of next/prev "word"
  if (aText->Is2b()) {
    nsIWordBreaker* wordBreaker = nsContentUtils::WordBreaker();
    const PRUnichar* text = aText->Get2b();
    PRInt32 pos = aPosition;
    // XXX the wordbreaker currently isn't cluster-aware. We need to make
    // it cluster-aware. In the meantime, just reject any word breaks
    // inside clusters.
    for (;;) {
      nextWordPos = aDirection > 0
        ? wordBreaker->NextWord(text, textLength, pos)
        : wordBreaker->PrevWord(text, textLength, pos);
      // A negative result means the word breaker found no further boundary.
      if (nextWordPos < 0)
        break;
      if (aTextRun->GetCharFlags(aIterator->ConvertOriginalToSkipped(nextWordPos)) &
          gfxTextRun::CLUSTER_START)
        break;
      pos = nextWordPos;
    }
  } else {
    const char* text = aText->Get1b();
    SimpleCharClass cl = Classify8BitChar(text[aPosition]);
    nextWordPos = aPosition;
    // There shouldn't be any clusters in 8bit text but we'll cover that
    // possibility anyway
    do {
      nextWordPos += aDirection;
      if (nextWordPos < aOffset || nextWordPos >= aOffset + aLength) {
        nextWordPos = NS_WORDBREAKER_NEED_MORE_TEXT;
        break;
      }
    } while (Classify8BitChar(text[nextWordPos]) == cl ||
             !(aTextRun->GetCharFlags(aIterator->ConvertOriginalToSkipped(nextWordPos)) &
               gfxTextRun::CLUSTER_START));
  }

  // Handle punctuation breaks
  PRInt32 i;
  PRBool punctPrev = IsPunctuationMark(aText->CharAt(aPosition));
  for (i = aPosition + aDirection;
       i != nextWordPos && i >= aOffset && i < aOffset + aLength;
       i += aDirection) {
    // See if there's a punctuation break between i-aDirection and i
    PRBool punct = IsPunctuationMark(aText->CharAt(i));
    if (punct != punctPrev &&
        (aTextRun->GetCharFlags(aIterator->ConvertOriginalToSkipped(i)) &
         gfxTextRun::CLUSTER_START)) {
      // Which side of the transition holds the punctuation depends on the
      // direction we are walking in.
      PRBool punctIsBefore = aDirection < 0 ? punct : punctPrev;
      if (punctIsBefore ? aBreakAfterPunctuation : aBreakBeforePunctuation)
        break;
    }
    punctPrev = punct;
  }
  if (i < aOffset || i >= aOffset + aLength)
    return -1;
  return i;
}

/**
 * Moves on to the next run of consecutive skipped or unskipped characters.
 *
 * The previous run (if any) is consumed first: the iterator is advanced by
 * mRunLength and, unless that run was skipped and mLengthIncludesSkipped is
 * false, mRunLength is subtracted from mRemainingLength. mSkipped and
 * mRunLength are then set for the run at the new position, with mRunLength
 * clamped to mRemainingLength. Skipped runs are stepped over when
 * mVisitSkipped is false.
 *
 * @return PR_FALSE once mRemainingLength reaches zero, PR_TRUE otherwise
 */
PRBool nsSkipCharsRunIterator::NextRun() {
  do {
    if (!mRemainingLength)
      return PR_FALSE;
    if (mRunLength) {
      mIterator.AdvanceOriginal(mRunLength);
      NS_ASSERTION(mRunLength > 0, "No characters in run (initial length too large?)");
      if (!mSkipped || mLengthIncludesSkipped) {
        mRemainingLength -= mRunLength;
      }
    }
    PRInt32 length;
    mSkipped = mIterator.IsOriginalCharSkipped(&length);
    mRunLength = PR_MIN(length, mRemainingLength);
  } while (!mVisitSkipped && mSkipped);

  return PR_TRUE;
}