diff --git a/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java b/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java index 01a6c7c15ae..d08252f5c12 100644 --- a/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java +++ b/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java @@ -222,6 +222,7 @@ public class AutoBuild { private boolean installDependencies = false; private final VirtualSourceRoot virtualSourceRoot; private ExtractorState state; + private final long maximumFileSizeInMegabytes; /** The default timeout when installing dependencies, in milliseconds. */ public static final int INSTALL_DEPENDENCIES_DEFAULT_TIMEOUT = 10 * 60 * 1000; // 10 minutes @@ -236,6 +237,7 @@ public class AutoBuild { this.defaultEncoding = getEnvVar("LGTM_INDEX_DEFAULT_ENCODING"); this.installDependencies = Boolean.valueOf(getEnvVar("LGTM_INDEX_TYPESCRIPT_INSTALL_DEPS")); this.virtualSourceRoot = makeVirtualSourceRoot(); + this.maximumFileSizeInMegabytes = EnvironmentVariables.getMegabyteCountFromPrefixedEnv("MAX_FILE_SIZE", 10); setupFileTypes(); setupXmlMode(); setupMatchers(); @@ -446,8 +448,8 @@ public class AutoBuild { } /** - * Returns whether the autobuilder has seen code. - * This is overridden in tests. + * Returns whether the autobuilder has seen code. + * This is overridden in tests. */ protected boolean hasSeenCode() { return seenCode; @@ -741,12 +743,12 @@ public class AutoBuild { dependencyInstallationResult = this.preparePackagesAndDependencies(filesToExtract); } Set extractedFiles = new LinkedHashSet<>(); - + // Extract HTML files as they may contain TypeScript CompletableFuture htmlFuture = extractFiles( filesToExtract, extractedFiles, extractors, f -> extractors.fileType(f) == FileType.HTML); - + htmlFuture.join(); // Wait for HTML extraction to be finished. 
// extract TypeScript projects and files @@ -1229,6 +1231,11 @@ protected DependencyInstallationResult preparePackagesAndDependencies(Set warn("Skipping " + file + ", which does not exist."); return; } + long fileSize = f.length(); + if (fileSize > 1_000_000L * this.maximumFileSizeInMegabytes) { + warn("Skipping " + file + " because it is too large (" + StringUtil.printFloat(fileSize / 1_000_000.0) + " MB). The limit is " + this.maximumFileSizeInMegabytes + " MB."); + return; + } try { long start = logBeginProcess("Extracting " + file); diff --git a/javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java b/javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java index 8ffcb65831c..39dfa70b285 100644 --- a/javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java +++ b/javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java @@ -1,5 +1,6 @@ package com.semmle.js.extractor; +import com.semmle.util.data.UnitParser; import com.semmle.util.exception.UserError; import com.semmle.util.process.Env; import com.semmle.util.process.Env.Var; @@ -7,7 +8,7 @@ import com.semmle.util.process.Env.Var; public class EnvironmentVariables { public static final String CODEQL_EXTRACTOR_JAVASCRIPT_ROOT_ENV_VAR = "CODEQL_EXTRACTOR_JAVASCRIPT_ROOT"; - + public static final String CODEQL_EXTRACTOR_JAVASCRIPT_SCRATCH_DIR_ENV_VAR = "CODEQL_EXTRACTOR_JAVASCRIPT_SCRATCH_DIR"; @@ -19,6 +20,36 @@ public class EnvironmentVariables { public static final String CODEQL_DIST_ENV_VAR = "CODEQL_DIST"; + /** + * Returns a number of megabytes by reading an environment variable with the given suffix, + * or the default value if not set. + *

+ * The following prefixes are tried: + * CODEQL_EXTRACTOR_JAVASCRIPT_, + * LGTM_, + * SEMMLE_. + */ + public static int getMegabyteCountFromPrefixedEnv(String suffix, int defaultValue) { + String envVar = "CODEQL_EXTRACTOR_JAVASCRIPT_" + suffix; + String value = Env.systemEnv().get(envVar); + if (value == null || value.length() == 0) { + envVar = "LGTM_" + suffix; + value = Env.systemEnv().get(envVar); + } + if (value == null || value.length() == 0) { + envVar = "SEMMLE_" + suffix; + value = Env.systemEnv().get(envVar); + } + if (value == null || value.length() == 0) { + return defaultValue; + } + Integer amount = UnitParser.parseOpt(value, UnitParser.MEGABYTES); + if (amount == null) { + throw new UserError("Invalid value for " + envVar + ": '" + value + "'"); + } + return amount; + } + /** * Gets the extractor root based on the CODEQL_EXTRACTOR_JAVASCRIPT_ROOT or * SEMMLE_DIST or environment variable, or null if neither is set. diff --git a/javascript/extractor/src/com/semmle/ts/extractor/TypeScriptParser.java b/javascript/extractor/src/com/semmle/ts/extractor/TypeScriptParser.java index 5da39155347..d19490286b7 100644 --- a/javascript/extractor/src/com/semmle/ts/extractor/TypeScriptParser.java +++ b/javascript/extractor/src/com/semmle/ts/extractor/TypeScriptParser.java @@ -273,23 +273,6 @@ public class TypeScriptParser { return result; } - private static int getMegabyteCountFromPrefixedEnv(String suffix, int defaultValue) { - String envVar = "SEMMLE_" + suffix; - String value = Env.systemEnv().get(envVar); - if (value == null || value.length() == 0) { - envVar = "LGTM_" + suffix; - value = Env.systemEnv().get(envVar); - } - if (value == null || value.length() == 0) { - return defaultValue; - } - Integer amount = UnitParser.parseOpt(value, UnitParser.MEGABYTES); - if (amount == null) { - throw new UserError("Invalid value for " + envVar + ": '" + value + "'"); - } - return amount; - } - /** Start the Node.js parser wrapper process. 
*/ private void setupParserWrapper() { verifyNodeInstallation(); @@ -297,8 +280,8 @@ public class TypeScriptParser { int mainMemoryMb = typescriptRam != 0 ? typescriptRam - : getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_SUFFIX, 2000); - int reserveMemoryMb = getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_RESERVE_SUFFIX, 400); + : EnvironmentVariables.getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_SUFFIX, 2000); + int reserveMemoryMb = EnvironmentVariables.getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_RESERVE_SUFFIX, 400); System.out.println("Memory for TypeScript process: " + mainMemoryMb + " MB, and " + reserveMemoryMb + " MB reserve"); diff --git a/javascript/ql/src/change-notes/2023-08-23-ignore-huge-files.md b/javascript/ql/src/change-notes/2023-08-23-ignore-huge-files.md new file mode 100644 index 00000000000..fc82b3b5a3f --- /dev/null +++ b/javascript/ql/src/change-notes/2023-08-23-ignore-huge-files.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Files larger than 10 MB are no longer extracted or analyzed.