using System;
using System.Collections.Generic;
using System.Text;

namespace CodeConverter.Tests
{
    /// <summary>
    /// Identifies which Unicode new line delimiter was found.
    /// Each member's numeric value is the code point of the delimiter
    /// (<see cref="CRLF"/> packs both code points as 0x0D0A).
    /// </summary>
#if NR6
    public
#endif
    enum UnicodeNewline
    {
        /// <summary>
        /// Not a recognized new line delimiter.
        /// </summary>
        Unknown,

        /// <summary>
        /// Line Feed, U+000A
        /// </summary>
        LF = 0x0A,

        /// <summary>
        /// Carriage Return followed by Line Feed, U+000D U+000A
        /// </summary>
        CRLF = 0x0D0A,

        /// <summary>
        /// Carriage Return, U+000D
        /// </summary>
        CR = 0x0D,

        /// <summary>
        /// Next Line, U+0085
        /// </summary>
        NEL = 0x85,

        /// <summary>
        /// Vertical Tab, U+000B
        /// </summary>
        VT = 0x0B,

        /// <summary>
        /// Form Feed, U+000C
        /// </summary>
        FF = 0x0C,

        /// <summary>
        /// Line Separator, U+2028
        /// </summary>
        LS = 0x2028,

        /// <summary>
        /// Paragraph Separator, U+2029
        /// </summary>
        PS = 0x2029
    }

    /// <summary>
    /// Defines unicode new lines according to Unicode Technical Report #13
    /// http://www.unicode.org/standard/reports/tr13/tr13-5.html
    /// </summary>
#if NR6
    public
#endif
    static class NewLine
    {
        /// <summary>
        /// Carriage Return, U+000D
        /// </summary>
        public const char CR = (char)0x0D;

        /// <summary>
        /// Line Feed, U+000A
        /// </summary>
        public const char LF = (char)0x0A;

        /// <summary>
        /// Next Line, U+0085
        /// </summary>
        public const char NEL = (char)0x85;

        /// <summary>
        /// Vertical Tab, U+000B
        /// </summary>
        public const char VT = (char)0x0B;

        /// <summary>
        /// Form Feed, U+000C
        /// </summary>
        public const char FF = (char)0x0C;

        /// <summary>
        /// Line Separator, U+2028
        /// </summary>
        public const char LS = (char)0x2028;

        /// <summary>
        /// Paragraph Separator, U+2029
        /// </summary>
        public const char PS = (char)0x2029;

        /// <summary>
        /// Resolves the look-ahead character for the callback based overloads.
        /// The callback is only invoked when <paramref name="curChar"/> is a carriage return,
        /// because that is the only case in which the following character matters.
        /// </summary>
        /// <returns>The character supplied by the callback, or '\0' when no look-ahead is needed or available.</returns>
        static char PeekAfterCarriageReturn(char curChar, Func<char> nextChar)
        {
            if (curChar == CR && nextChar != null)
            {
                return nextChar();
            }
            return '\0';
        }

        /// <summary>
        /// Determines if a char is a new line delimiter.
        /// </summary>
        /// <returns>0 == no new line, otherwise it returns either 1 or 2 depending of the length of the delimiter.</returns>
        /// <param name="curChar">The current character.</param>
        /// <param name="nextChar">A callback getting the next character (may be null). Only invoked when <paramref name="curChar"/> is CR.</param>
        public static int GetDelimiterLength(char curChar, Func<char> nextChar = null)
        {
            return GetDelimiterLength(curChar, PeekAfterCarriageReturn(curChar, nextChar));
        }

        /// <summary>
        /// Determines if a char is a new line delimiter.
        /// </summary>
        /// <returns>0 == no new line, otherwise it returns either 1 or 2 depending of the length of the delimiter.</returns>
        /// <param name="curChar">The current character.</param>
        /// <param name="nextChar">The next character (if != LF then length will always be 0 or 1).</param>
        public static int GetDelimiterLength(char curChar, char nextChar)
        {
            if (curChar == CR)
            {
                // CR LF is the only two character delimiter.
                return nextChar == LF ? 2 : 1;
            }
            return IsNewLine(curChar) ? 1 : 0;
        }

        /// <summary>
        /// Determines if a char is a new line delimiter and reports its length and type.
        /// </summary>
        /// <returns><c>true</c> if <paramref name="curChar"/> starts a new line delimiter; otherwise <c>false</c>.</returns>
        /// <param name="curChar">The current character.</param>
        /// <param name="length">The length of the delimiter (1 or 2), or -1 when no delimiter was found.</param>
        /// <param name="type">The type of the delimiter, or <see cref="UnicodeNewline.Unknown"/> when no delimiter was found.</param>
        /// <param name="nextChar">A callback getting the next character (may be null). Only invoked when <paramref name="curChar"/> is CR.</param>
        public static bool TryGetDelimiterLengthAndType(char curChar, out int length, out UnicodeNewline type, Func<char> nextChar = null)
        {
            return TryGetDelimiterLengthAndType(curChar, out length, out type, PeekAfterCarriageReturn(curChar, nextChar));
        }

        /// <summary>
        /// Determines if a char is a new line delimiter and reports its length and type.
        /// </summary>
        /// <returns><c>true</c> if <paramref name="curChar"/> starts a new line delimiter; otherwise <c>false</c>.</returns>
        /// <param name="curChar">The current character.</param>
        /// <param name="length">The length of the delimiter (1 or 2), or -1 when no delimiter was found.</param>
        /// <param name="type">The type of the delimiter, or <see cref="UnicodeNewline.Unknown"/> when no delimiter was found.</param>
        /// <param name="nextChar">The next character (if != LF then length will never be 2).</param>
        public static bool TryGetDelimiterLengthAndType(char curChar, out int length, out UnicodeNewline type, char nextChar)
        {
            type = GetDelimiterType(curChar, nextChar);
            switch (type)
            {
                case UnicodeNewline.Unknown:
                    length = -1;
                    return false;
                case UnicodeNewline.CRLF:
                    length = 2;
                    return true;
                default:
                    // Every other delimiter is a single character.
                    length = 1;
                    return true;
            }
        }

        /// <summary>
        /// Gets the new line type of a given char/next char.
        /// </summary>
        /// <returns>The type of the delimiter starting at <paramref name="curChar"/>, or <see cref="UnicodeNewline.Unknown"/> if it is no new line.</returns>
        /// <param name="curChar">The current character.</param>
        /// <param name="nextChar">A callback getting the next character (may be null). Only invoked when <paramref name="curChar"/> is CR.</param>
        public static UnicodeNewline GetDelimiterType(char curChar, Func<char> nextChar = null)
        {
            return GetDelimiterType(curChar, PeekAfterCarriageReturn(curChar, nextChar));
        }

        /// <summary>
        /// Gets the new line type of a given char/next char.
        /// </summary>
        /// <returns>The type of the delimiter starting at <paramref name="curChar"/>, or <see cref="UnicodeNewline.Unknown"/> if it is no new line.</returns>
        /// <param name="curChar">The current character.</param>
        /// <param name="nextChar">The next character (only relevant to tell CR LF apart from a lone CR).</param>
        public static UnicodeNewline GetDelimiterType(char curChar, char nextChar)
        {
            switch (curChar)
            {
                case CR:
                    return nextChar == LF ? UnicodeNewline.CRLF : UnicodeNewline.CR;
                case LF:
                    return UnicodeNewline.LF;
                case NEL:
                    return UnicodeNewline.NEL;
                case VT:
                    return UnicodeNewline.VT;
                case FF:
                    return UnicodeNewline.FF;
                case LS:
                    return UnicodeNewline.LS;
                case PS:
                    return UnicodeNewline.PS;
                default:
                    return UnicodeNewline.Unknown;
            }
        }

        /// <summary>
        /// Determines if a char is a new line delimiter.
        ///
        /// Note that the only 2 char wide new line is CR LF and both chars are new line
        /// chars on their own. For most cases GetDelimiterLength is the better choice.
        /// </summary>
        public static bool IsNewLine(char ch)
        {
            switch (ch)
            {
                case CR:
                case LF:
                case NEL:
                case VT:
                case FF:
                case LS:
                case PS:
                    return true;
                default:
                    return false;
            }
        }

        /// <summary>
        /// Gets the new line as a string.
        /// </summary>
        /// <param name="newLine">The new line type to convert.</param>
        /// <returns>The delimiter characters, or an empty string for <see cref="UnicodeNewline.Unknown"/>.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="newLine"/> is not a defined <see cref="UnicodeNewline"/> value.</exception>
        public static string GetString(UnicodeNewline newLine)
        {
            switch (newLine)
            {
                case UnicodeNewline.Unknown:
                    return "";
                case UnicodeNewline.LF:
                    return "\n";
                case UnicodeNewline.CRLF:
                    return "\r\n";
                case UnicodeNewline.CR:
                    return "\r";
                case UnicodeNewline.NEL:
                    return "\u0085";
                case UnicodeNewline.VT:
                    return "\u000B";
                case UnicodeNewline.FF:
                    return "\u000C";
                case UnicodeNewline.LS:
                    return "\u2028";
                case UnicodeNewline.PS:
                    return "\u2029";
                default:
                    throw new ArgumentOutOfRangeException("newLine");
            }
        }

        /// <summary>
        /// Splits a text into its lines at every recognized new line delimiter.
        /// The delimiters themselves are not part of the returned lines.
        /// </summary>
        /// <param name="text">The text to split.</param>
        /// <returns>
        /// The lines of <paramref name="text"/>. Empty lines that are terminated by a delimiter are kept,
        /// but an empty remainder after the last delimiter is not reported (so "a\n" yields one line
        /// and an empty text yields no lines).
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public static string[] SplitLines(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            var lines = new List<string>();
            var currentLine = new StringBuilder();
            for (int i = 0; i < text.Length; i++)
            {
                char ch = text[i];
                // Plain look-ahead instead of a capturing lambda: avoids a delegate allocation per character.
                char next = i + 1 < text.Length ? text[i + 1] : '\0';
                int delimiterLength = GetDelimiterLength(ch, next);
                if (delimiterLength > 0)
                {
                    lines.Add(currentLine.ToString());
                    currentLine.Length = 0;
                    // Skip the LF of a CR LF pair; the loop increment consumes the first char.
                    i += delimiterLength - 1;
                    continue;
                }
                currentLine.Append(ch);
            }

            // Only a non-empty trailing remainder counts as a line.
            if (currentLine.Length > 0)
            {
                lines.Add(currentLine.ToString());
            }
            return lines.ToArray();
        }
    }
}