gecko-dev/js/public/String.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* JavaScript string operations. */

#ifndef js_String_h
#define js_String_h

#include "js/shadow/String.h"  // JS::shadow::String

#include "mozilla/Assertions.h"  // MOZ_ASSERT
#include "mozilla/Attributes.h"  // MOZ_ALWAYS_INLINE
#include "mozilla/Likely.h"      // MOZ_LIKELY
#include "mozilla/Maybe.h"       // mozilla::Maybe
#include "mozilla/Range.h"       // mozilla::Range
#include "mozilla/Span.h"        // mozilla::Span
#include "mozilla/Tuple.h"       // mozilla::Tuple

#include <algorithm>  // std::copy_n
#include <stddef.h>   // size_t
#include <stdint.h>   // uint32_t, uint64_t, INT32_MAX

#include "jstypes.h"  // JS_PUBLIC_API

#include "js/CharacterEncoding.h"  // JS::UTF8Chars, JS::ConstUTF8CharsZ
#include "js/Id.h"                 // jsid, JSID_IS_STRING, JSID_TO_STRING
#include "js/RootingAPI.h"         // JS::Handle
#include "js/TypeDecls.h"          // JS::Latin1Char
#include "js/UniquePtr.h"          // JS::UniquePtr
#include "js/Utility.h"            // JS::FreePolicy, JS::UniqueTwoByteChars
#include "js/Value.h"              // JS::Value

struct JS_PUBLIC_API JSContext;
class JS_PUBLIC_API JSAtom;
class JSLinearString;
class JS_PUBLIC_API JSString;

namespace JS {

class JS_PUBLIC_API AutoRequireNoGC;

}  // namespace JS

extern JS_PUBLIC_API JSString* JS_GetEmptyString(JSContext* cx);

// Don't want to export data, so provide accessors for non-inline Values.
extern JS_PUBLIC_API JS::Value JS_GetEmptyStringValue(JSContext* cx);

/*
 * String creation.
 *
 * NB: JS_NewUCString takes ownership of bytes on success, avoiding a copy;
 * but on error (signified by null return), it leaves chars owned by the
 * caller. So the caller must free bytes in the error case, if it has no use
 * for them. In contrast, all the JS_New*StringCopy* functions do not take
 * ownership of the character memory passed to them -- they copy it.
 */

extern JS_PUBLIC_API JSString* JS_NewStringCopyN(JSContext* cx, const char* s,
                                                 size_t n);

extern JS_PUBLIC_API JSString* JS_NewStringCopyZ(JSContext* cx, const char* s);

extern JS_PUBLIC_API JSString* JS_NewStringCopyUTF8Z(
    JSContext* cx, const JS::ConstUTF8CharsZ s);

extern JS_PUBLIC_API JSString* JS_NewStringCopyUTF8N(JSContext* cx,
                                                     const JS::UTF8Chars s);

extern JS_PUBLIC_API JSString* JS_AtomizeStringN(JSContext* cx, const char* s,
                                                 size_t length);

extern JS_PUBLIC_API JSString* JS_AtomizeString(JSContext* cx, const char* s);

// Note: unlike the non-pinning JS_Atomize* functions, this can be called
// without entering a realm/zone.
extern JS_PUBLIC_API JSString* JS_AtomizeAndPinStringN(JSContext* cx,
                                                       const char* s,
                                                       size_t length);

// Note: unlike the non-pinning JS_Atomize* functions, this can be called
// without entering a realm/zone.
extern JS_PUBLIC_API JSString* JS_AtomizeAndPinString(JSContext* cx,
                                                      const char* s);

extern JS_PUBLIC_API JSString* JS_NewLatin1String(
    JSContext* cx, js::UniquePtr<JS::Latin1Char[], JS::FreePolicy> chars,
    size_t length);

extern JS_PUBLIC_API JSString* JS_NewUCString(JSContext* cx,
                                              JS::UniqueTwoByteChars chars,
                                              size_t length);

extern JS_PUBLIC_API JSString* JS_NewUCStringDontDeflate(
    JSContext* cx, JS::UniqueTwoByteChars chars, size_t length);

extern JS_PUBLIC_API JSString* JS_NewUCStringCopyN(JSContext* cx,
                                                   const char16_t* s, size_t n);

extern JS_PUBLIC_API JSString* JS_NewUCStringCopyZ(JSContext* cx,
                                                   const char16_t* s);

extern JS_PUBLIC_API JSString* JS_AtomizeUCStringN(JSContext* cx,
                                                   const char16_t* s,
                                                   size_t length);

extern JS_PUBLIC_API JSString* JS_AtomizeUCString(JSContext* cx,
                                                  const char16_t* s);

extern JS_PUBLIC_API bool JS_CompareStrings(JSContext* cx, JSString* str1,
                                            JSString* str2, int32_t* result);

[[nodiscard]] extern JS_PUBLIC_API bool JS_StringEqualsAscii(
    JSContext* cx, JSString* str, const char* asciiBytes, bool* match);

// Same as above, but when the length of asciiBytes (excluding the
// trailing null, if any) is known.
[[nodiscard]] extern JS_PUBLIC_API bool JS_StringEqualsAscii(
    JSContext* cx, JSString* str, const char* asciiBytes, size_t length,
    bool* match);

template <size_t N>
[[nodiscard]] bool JS_StringEqualsLiteral(JSContext* cx, JSString* str,
                                          const char (&asciiBytes)[N],
                                          bool* match) {
  MOZ_ASSERT(asciiBytes[N - 1] == '\0');
  return JS_StringEqualsAscii(cx, str, asciiBytes, N - 1, match);
}

extern JS_PUBLIC_API size_t JS_PutEscapedString(JSContext* cx, char* buffer,
                                                size_t size, JSString* str,
                                                char quote);

/*
 * Extracting string characters and length.
 *
 * While getting the length of a string is infallible, getting the chars can
 * fail. As indicated by the lack of a JSContext parameter, there are two
 * special cases where getting the chars is infallible:
 *
 * The first case is for strings that have been atomized, e.g. directly by
 * JS_AtomizeAndPinString or implicitly because it is stored in a jsid.
 *
 * The second case is "linear" strings that have been explicitly prepared in a
 * fallible context by JS_EnsureLinearString. To catch errors, a separate opaque
 * JSLinearString type is returned by JS_EnsureLinearString and expected by
 * JS_Get{Latin1,TwoByte}StringCharsAndLength. Note, though, that this is purely
 * a syntactic distinction: the input and output of JS_EnsureLinearString are
 * the same actual GC-thing. If a JSString is known to be linear,
 * JS_ASSERT_STRING_IS_LINEAR can be used to make a debug-checked cast. Example:
 *
 *   // In a fallible context.
 *   JSLinearString* lstr = JS_EnsureLinearString(cx, str);
 *   if (!lstr) {
 *     return false;
 *   }
 *   MOZ_ASSERT(lstr == JS_ASSERT_STRING_IS_LINEAR(str));
 *
 *   // In an infallible context, for the same 'str'.
 *   AutoCheckCannotGC nogc;
 *   const char16_t* chars = JS::GetTwoByteLinearStringChars(nogc, lstr)
 *   MOZ_ASSERT(chars);
 *
 * Note: JS strings (including linear strings and atoms) are not
 * null-terminated!
 *
 * Additionally, string characters are stored as either Latin1Char (8-bit)
 * or char16_t (16-bit). Clients can use JS::StringHasLatin1Chars and can then
 * call either the Latin1* or TwoByte* functions. Some functions like
 * JS_CopyStringChars and JS_GetStringCharAt accept both Latin1 and TwoByte
 * strings.
 */

extern JS_PUBLIC_API size_t JS_GetStringLength(JSString* str);

extern JS_PUBLIC_API bool JS_StringIsLinear(JSString* str);

extern JS_PUBLIC_API const JS::Latin1Char* JS_GetLatin1StringCharsAndLength(
    JSContext* cx, const JS::AutoRequireNoGC& nogc, JSString* str,
    size_t* length);

extern JS_PUBLIC_API const char16_t* JS_GetTwoByteStringCharsAndLength(
    JSContext* cx, const JS::AutoRequireNoGC& nogc, JSString* str,
    size_t* length);

extern JS_PUBLIC_API bool JS_GetStringCharAt(JSContext* cx, JSString* str,
                                             size_t index, char16_t* res);

extern JS_PUBLIC_API const char16_t* JS_GetTwoByteExternalStringChars(
    JSString* str);

extern JS_PUBLIC_API bool JS_CopyStringChars(JSContext* cx,
                                             mozilla::Range<char16_t> dest,
                                             JSString* str);

/**
 * Copies the string's characters to a null-terminated char16_t buffer.
 *
 * Returns nullptr on OOM.
 */
extern JS_PUBLIC_API JS::UniqueTwoByteChars JS_CopyStringCharsZ(JSContext* cx,
                                                                JSString* str);

extern JS_PUBLIC_API JSLinearString* JS_EnsureLinearString(JSContext* cx,
                                                           JSString* str);

static MOZ_ALWAYS_INLINE JSLinearString* JS_ASSERT_STRING_IS_LINEAR(
    JSString* str) {
  MOZ_ASSERT(JS_StringIsLinear(str));
  return reinterpret_cast<JSLinearString*>(str);
}

static MOZ_ALWAYS_INLINE JSString* JS_FORGET_STRING_LINEARNESS(
    JSLinearString* str) {
  return reinterpret_cast<JSString*>(str);
}

/*
 * Additional APIs that avoid fallibility when given a linear string.
 */

extern JS_PUBLIC_API bool JS_LinearStringEqualsAscii(JSLinearString* str,
                                                     const char* asciiBytes);
extern JS_PUBLIC_API bool JS_LinearStringEqualsAscii(JSLinearString* str,
                                                     const char* asciiBytes,
                                                     size_t length);

template <size_t N>
bool JS_LinearStringEqualsLiteral(JSLinearString* str,
                                  const char (&asciiBytes)[N]) {
  MOZ_ASSERT(asciiBytes[N - 1] == '\0');
  return JS_LinearStringEqualsAscii(str, asciiBytes, N - 1);
}

extern JS_PUBLIC_API size_t JS_PutEscapedLinearString(char* buffer, size_t size,
                                                      JSLinearString* str,
                                                      char quote);

/**
 * Create a dependent string, i.e., a string that owns no character storage,
 * but that refers to a slice of another string's chars.  Dependent strings
 * are mutable by definition, so the thread safety comments above apply.
 */
extern JS_PUBLIC_API JSString* JS_NewDependentString(JSContext* cx,
                                                     JS::Handle<JSString*> str,
                                                     size_t start,
                                                     size_t length);

/**
 * Concatenate two strings, possibly resulting in a rope.
 * See above for thread safety comments.
 */
extern JS_PUBLIC_API JSString* JS_ConcatStrings(JSContext* cx,
                                                JS::Handle<JSString*> left,
                                                JS::Handle<JSString*> right);

/**
 * For JS_DecodeBytes, set *dstlenp to the size of the destination buffer before
 * the call; on return, *dstlenp contains the number of characters actually
 * stored. To determine the necessary destination buffer size, make a sizing
 * call that passes nullptr for dst.
 *
 * On errors, the functions report the error. In that case, *dstlenp contains
 * the number of characters or bytes transferred so far.  If cx is nullptr, no
 * error is reported on failure, and the functions simply return false.
 *
 * NB: This function does not store an additional zero byte or char16_t after
 * the transcoded string.
 */
JS_PUBLIC_API bool JS_DecodeBytes(JSContext* cx, const char* src, size_t srclen,
                                  char16_t* dst, size_t* dstlenp);

/**
 * Get number of bytes in the string encoding (without accounting for a
 * terminating zero bytes. The function returns (size_t) -1 if the string
 * can not be encoded into bytes and reports an error using cx accordingly.
 */
JS_PUBLIC_API size_t JS_GetStringEncodingLength(JSContext* cx, JSString* str);

/**
 * Encode string into a buffer. The function does not stores an additional
 * zero byte. The function returns (size_t) -1 if the string can not be
 * encoded into bytes with no error reported. Otherwise it returns the number
 * of bytes that are necessary to encode the string. If that exceeds the
 * length parameter, the string will be cut and only length bytes will be
 * written into the buffer.
 */
[[nodiscard]] JS_PUBLIC_API bool JS_EncodeStringToBuffer(JSContext* cx,
                                                         JSString* str,
                                                         char* buffer,
                                                         size_t length);

/**
 * Encode as many scalar values of the string as UTF-8 as can fit
 * into the caller-provided buffer replacing unpaired surrogates
 * with the REPLACEMENT CHARACTER.
 *
 * If JS::StringHasLatin1Chars(str) returns true, the function
 * is guaranteed to convert the entire string if
 * buffer.Length() >= 2 * JS_GetStringLength(str). Otherwise,
 * the function is guaranteed to convert the entire string if
 * buffer.Length() >= 3 * JS_GetStringLength(str).
 *
 * This function does not alter the representation of |str| or
 * any |JSString*| substring that is a constituent part of it.
 * Returns mozilla::Nothing() on OOM, without reporting an error;
 * some data may have been written to |buffer| when this happens.
 *
 * If there's no OOM, returns the number of code units read and
 * the number of code units written.
 *
 * The semantics of this method match the semantics of
 * TextEncoder.encodeInto().
 *
 * The function does not store an additional zero byte.
 */
JS_PUBLIC_API mozilla::Maybe<mozilla::Tuple<size_t, size_t>>
JS_EncodeStringToUTF8BufferPartial(JSContext* cx, JSString* str,
                                   mozilla::Span<char> buffer);

namespace JS {

/**
 * Maximum length of a JS string. This is chosen so that the number of bytes
 * allocated for a null-terminated TwoByte string still fits in int32_t.
 */
static constexpr uint32_t MaxStringLength = (1 << 30) - 2;

static_assert((uint64_t(MaxStringLength) + 1) * sizeof(char16_t) <= INT32_MAX,
              "size of null-terminated JSString char buffer must fit in "
              "INT32_MAX");

/** Compute the length of a string. */
MOZ_ALWAYS_INLINE size_t GetStringLength(JSString* s) {
  return shadow::AsShadowString(s)->length();
}

/** Compute the length of a linear string. */
MOZ_ALWAYS_INLINE size_t GetLinearStringLength(JSLinearString* s) {
  return shadow::AsShadowString(s)->length();
}

/** Return true iff the given linear string uses Latin-1 storage. */
MOZ_ALWAYS_INLINE bool LinearStringHasLatin1Chars(JSLinearString* s) {
  return shadow::AsShadowString(s)->hasLatin1Chars();
}

/** Return true iff the given string uses Latin-1 storage. */
MOZ_ALWAYS_INLINE bool StringHasLatin1Chars(JSString* s) {
  return shadow::AsShadowString(s)->hasLatin1Chars();
}

/**
 * Given a linear string known to use Latin-1 storage, return a pointer to that
 * storage.  This pointer remains valid only as long as no GC occurs.
 */
MOZ_ALWAYS_INLINE const Latin1Char* GetLatin1LinearStringChars(
    const AutoRequireNoGC& nogc, JSLinearString* linear) {
  return shadow::AsShadowString(linear)->latin1LinearChars();
}

/**
 * Given a linear string known to use two-byte storage, return a pointer to that
 * storage.  This pointer remains valid only as long as no GC occurs.
 */
MOZ_ALWAYS_INLINE const char16_t* GetTwoByteLinearStringChars(
    const AutoRequireNoGC& nogc, JSLinearString* linear) {
  return shadow::AsShadowString(linear)->twoByteLinearChars();
}

/**
 * Given an in-range index into the provided string, return the character at
 * that index.
 */
MOZ_ALWAYS_INLINE char16_t GetLinearStringCharAt(JSLinearString* linear,
                                                 size_t index) {
  shadow::String* s = shadow::AsShadowString(linear);
  MOZ_ASSERT(index < s->length());

  return s->hasLatin1Chars() ? s->latin1LinearChars()[index]
                             : s->twoByteLinearChars()[index];
}

/**
 * Convert an atom to a linear string.  All atoms are linear, so this
 * operation is infallible.
 */
MOZ_ALWAYS_INLINE JSLinearString* AtomToLinearString(JSAtom* atom) {
  return reinterpret_cast<JSLinearString*>(atom);
}

/**
 * If the provided string uses externally-managed storage, return true and set
 * |*callbacks| to the external-string callbacks used to create it and |*chars|
 * to a pointer to its two-byte storage.  (These pointers remain valid as long
 * as the provided string is kept alive.)
 */
MOZ_ALWAYS_INLINE bool IsExternalString(
    JSString* str, const JSExternalStringCallbacks** callbacks,
    const char16_t** chars) {
  shadow::String* s = shadow::AsShadowString(str);

  if (!s->isExternal()) {
    return false;
  }

  *callbacks = s->externalCallbacks;
  *chars = s->nonInlineCharsTwoByte;
  return true;
}

namespace detail {

extern JS_PUBLIC_API JSLinearString* StringToLinearStringSlow(JSContext* cx,
                                                              JSString* str);

}  // namespace detail

/** Convert a string to a linear string. */
MOZ_ALWAYS_INLINE JSLinearString* StringToLinearString(JSContext* cx,
                                                       JSString* str) {
  if (MOZ_LIKELY(shadow::AsShadowString(str)->isLinear())) {
    return reinterpret_cast<JSLinearString*>(str);
  }

  return detail::StringToLinearStringSlow(cx, str);
}

/** Copy characters in |s[start..start + len]| to |dest[0..len]|. */
MOZ_ALWAYS_INLINE void CopyLinearStringChars(char16_t* dest, JSLinearString* s,
                                             size_t len, size_t start = 0) {
#ifdef DEBUG
  size_t stringLen = GetLinearStringLength(s);
  MOZ_ASSERT(start <= stringLen);
  MOZ_ASSERT(len <= stringLen - start);
#endif

  shadow::String* str = shadow::AsShadowString(s);

  if (str->hasLatin1Chars()) {
    const Latin1Char* src = str->latin1LinearChars();
    for (size_t i = 0; i < len; i++) {
      dest[i] = src[start + i];
    }
  } else {
    const char16_t* src = str->twoByteLinearChars();
    std::copy_n(src + start, len, dest);
  }
}

/**
 * Copy characters in |s[start..start + len]| to |dest[0..len]|, lossily
 * truncating 16-bit values to |char| if necessary.
 */
MOZ_ALWAYS_INLINE void LossyCopyLinearStringChars(char* dest, JSLinearString* s,
                                                  size_t len,
                                                  size_t start = 0) {
#ifdef DEBUG
  size_t stringLen = GetLinearStringLength(s);
  MOZ_ASSERT(start <= stringLen);
  MOZ_ASSERT(len <= stringLen - start);
#endif

  shadow::String* str = shadow::AsShadowString(s);

  if (LinearStringHasLatin1Chars(s)) {
    const Latin1Char* src = str->latin1LinearChars();
    for (size_t i = 0; i < len; i++) {
      dest[i] = char(src[start + i]);
    }
  } else {
    const char16_t* src = str->twoByteLinearChars();
    for (size_t i = 0; i < len; i++) {
      dest[i] = char(src[start + i]);
    }
  }
}

/**
 * Copy characters in |s[start..start + len]| to |dest[0..len]|.
 *
 * This function is fallible.  If you already have a linear string, use the
 * infallible |JS::CopyLinearStringChars| above instead.
 */
[[nodiscard]] inline bool CopyStringChars(JSContext* cx, char16_t* dest,
                                          JSString* s, size_t len,
                                          size_t start = 0) {
  JSLinearString* linear = StringToLinearString(cx, s);
  if (!linear) {
    return false;
  }

  CopyLinearStringChars(dest, linear, len, start);
  return true;
}

/**
 * Copy characters in |s[start..start + len]| to |dest[0..len]|, lossily
 * truncating 16-bit values to |char| if necessary.
 *
 * This function is fallible.  If you already have a linear string, use the
 * infallible |JS::LossyCopyLinearStringChars| above instead.
 */
[[nodiscard]] inline bool LossyCopyStringChars(JSContext* cx, char* dest,
                                               JSString* s, size_t len,
                                               size_t start = 0) {
  JSLinearString* linear = StringToLinearString(cx, s);
  if (!linear) {
    return false;
  }

  LossyCopyLinearStringChars(dest, linear, len, start);
  return true;
}

}  // namespace JS

/** DO NOT USE, only present for Rust bindings as a temporary hack */
[[deprecated]] extern JS_PUBLIC_API bool JS_DeprecatedStringHasLatin1Chars(
    JSString* str);

#endif  // js_String_h