mirror of
https://github.com/torproject/collector.git
synced 2024-11-27 03:10:28 +00:00
Reduce memory footprint and wall time.
Adapt to the latest changes in metrics-lib (task-25329) and exploit the high redundancy of logs (e.g., a 3G file might contain only 350 distinct lines). This avoids OOM and array-out-of-bounds exceptions for large files (>2G) and gives a speed-up of roughly 50%: processing the meronense & weschniakowii files plus two larger files went down from 66min to 34min. There is a BATCH constant that could be tuned for processing speed; its value is logged for each webstats module run. It is currently set to 100k, which was chosen more or less arbitrarily and used for all the tests. A test run using 500k didn't show significant differences.
This commit is contained in:
parent
fbb35f75da
commit
8557bf6255
@ -5,7 +5,6 @@ package org.torproject.collector.persist;
|
|||||||
|
|
||||||
import org.torproject.descriptor.WebServerAccessLog;
|
import org.torproject.descriptor.WebServerAccessLog;
|
||||||
import org.torproject.descriptor.internal.FileType;
|
import org.torproject.descriptor.internal.FileType;
|
||||||
import org.torproject.descriptor.log.InternalLogDescriptor;
|
|
||||||
import org.torproject.descriptor.log.InternalWebServerAccessLog;
|
import org.torproject.descriptor.log.InternalWebServerAccessLog;
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -30,13 +29,6 @@ public class WebServerAccessLogPersistence
|
|||||||
/** Prepare storing the given descriptor. */
|
/** Prepare storing the given descriptor. */
|
||||||
public WebServerAccessLogPersistence(WebServerAccessLog desc) {
|
public WebServerAccessLogPersistence(WebServerAccessLog desc) {
|
||||||
super(desc, new byte[0]);
|
super(desc, new byte[0]);
|
||||||
byte[] compressedBytes = null;
|
|
||||||
try { // The descriptor bytes have to be stored compressed.
|
|
||||||
compressedBytes = COMPRESSION.compress(desc.getRawDescriptorBytes());
|
|
||||||
((InternalLogDescriptor)desc).setRawDescriptorBytes(compressedBytes);
|
|
||||||
} catch (Exception ex) {
|
|
||||||
log.warn("Cannot compress ’{}’. Storing uncompressed.", ex);
|
|
||||||
}
|
|
||||||
calculatePaths();
|
calculatePaths();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
package org.torproject.collector.webstats;
|
package org.torproject.collector.webstats;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.counting;
|
||||||
import static java.util.stream.Collectors.groupingByConcurrent;
|
import static java.util.stream.Collectors.groupingByConcurrent;
|
||||||
import static java.util.stream.Collectors.toList;
|
import static java.util.stream.Collectors.toList;
|
||||||
|
|
||||||
@ -17,6 +18,7 @@ import org.torproject.collector.persist.WebServerAccessLogPersistence;
|
|||||||
import org.torproject.descriptor.DescriptorParseException;
|
import org.torproject.descriptor.DescriptorParseException;
|
||||||
import org.torproject.descriptor.Method;
|
import org.torproject.descriptor.Method;
|
||||||
import org.torproject.descriptor.WebServerAccessLog;
|
import org.torproject.descriptor.WebServerAccessLog;
|
||||||
|
import org.torproject.descriptor.internal.FileType;
|
||||||
import org.torproject.descriptor.log.InternalLogDescriptor;
|
import org.torproject.descriptor.log.InternalLogDescriptor;
|
||||||
import org.torproject.descriptor.log.InternalWebServerAccessLog;
|
import org.torproject.descriptor.log.InternalWebServerAccessLog;
|
||||||
import org.torproject.descriptor.log.WebServerAccessLogImpl;
|
import org.torproject.descriptor.log.WebServerAccessLogImpl;
|
||||||
@ -26,8 +28,9 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
@ -40,6 +43,7 @@ import java.util.SortedSet;
|
|||||||
import java.util.StringJoiner;
|
import java.util.StringJoiner;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -87,6 +91,7 @@ public class SanitizeWeblogs extends CollecTorMain {
|
|||||||
Set<SourceType> sources = this.config.getSourceTypeSet(
|
Set<SourceType> sources = this.config.getSourceTypeSet(
|
||||||
Key.WebstatsSources);
|
Key.WebstatsSources);
|
||||||
if (sources.contains(SourceType.Local)) {
|
if (sources.contains(SourceType.Local)) {
|
||||||
|
log.info("Processing logs using batch value {}.", BATCH);
|
||||||
findCleanWrite(this.config.getPath(Key.WebstatsLocalOrigins));
|
findCleanWrite(this.config.getPath(Key.WebstatsLocalOrigins));
|
||||||
PersistenceUtils.cleanDirectory(this.config.getPath(Key.RecentPath));
|
PersistenceUtils.cleanDirectory(this.config.getPath(Key.RecentPath));
|
||||||
}
|
}
|
||||||
@ -126,24 +131,60 @@ public class SanitizeWeblogs extends CollecTorMain {
|
|||||||
String name = new StringJoiner(InternalLogDescriptor.SEP)
|
String name = new StringJoiner(InternalLogDescriptor.SEP)
|
||||||
.add(virtualHost).add(physicalHost)
|
.add(virtualHost).add(physicalHost)
|
||||||
.add(InternalWebServerAccessLog.MARKER)
|
.add(InternalWebServerAccessLog.MARKER)
|
||||||
.add(date.format(DateTimeFormatter.BASIC_ISO_DATE)).toString();
|
.add(date.format(DateTimeFormatter.BASIC_ISO_DATE))
|
||||||
|
.toString() + "." + FileType.XZ.name().toLowerCase();
|
||||||
log.debug("Sanitizing {}.", name);
|
log.debug("Sanitizing {}.", name);
|
||||||
List<String> retainedLines = lines
|
Map<String, Long> retainedLines = new TreeMap<>(lines
|
||||||
.stream().parallel().map((line) -> sanitize(line, date))
|
.stream().parallel().map((line) -> sanitize(line, date))
|
||||||
.filter((line) -> line.isPresent()).map((line) -> line.get())
|
.filter((line) -> line.isPresent())
|
||||||
.collect(toList());
|
.map((line) -> line.get())
|
||||||
retainedLines.sort(null);
|
.collect(groupingByConcurrent(line -> line, counting())));
|
||||||
|
lines.clear(); // not needed anymore
|
||||||
try {
|
try {
|
||||||
WebServerAccessLogPersistence walp
|
WebServerAccessLogPersistence walp
|
||||||
= new WebServerAccessLogPersistence(
|
= new WebServerAccessLogPersistence(
|
||||||
new WebServerAccessLogImpl(retainedLines, name, false));
|
new WebServerAccessLogImpl(toCompressedBytes(retainedLines),
|
||||||
|
name, false));
|
||||||
log.debug("Storing {}.", name);
|
log.debug("Storing {}.", name);
|
||||||
walp.storeOut(this.outputPathName);
|
walp.storeOut(this.outputPathName);
|
||||||
walp.storeRecent(this.recentPathName);
|
walp.storeRecent(this.recentPathName);
|
||||||
} catch (DescriptorParseException dpe) {
|
} catch (DescriptorParseException dpe) {
|
||||||
log.error("Cannot store log desriptor {}.", name, dpe);
|
log.error("Cannot store log desriptor {}.", name, dpe);
|
||||||
|
} catch (Throwable th) { // catch all else
|
||||||
|
log.error("Serious problem. Cannot store log desriptor {}.", name, th);
|
||||||
}
|
}
|
||||||
lines.clear();
|
}
|
||||||
|
|
||||||
|
/** Largest number of repetitions of a single line that
 * {@code toCompressedBytes} serializes into one write; higher counts are
 * written as several chunks of this size plus a remainder. */
private static final int BATCH = 100_000;
|
||||||
|
|
||||||
|
static byte[] toCompressedBytes(Map<String, Long> lines)
|
||||||
|
throws DescriptorParseException {
|
||||||
|
try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
|
OutputStream os = FileType.XZ.outputStream(baos)) {
|
||||||
|
for (Map.Entry<String, Long> entry : lines.entrySet()) {
|
||||||
|
long count = entry.getValue();
|
||||||
|
byte[] batch = bytesFor(entry.getKey(), BATCH);
|
||||||
|
while (count > 0) {
|
||||||
|
if (count > BATCH) {
|
||||||
|
os.write(batch);
|
||||||
|
count -= BATCH;
|
||||||
|
} else {
|
||||||
|
os.write(bytesFor(entry.getKey(), count));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
os.flush();
|
||||||
|
os.close();
|
||||||
|
return baos.toByteArray();
|
||||||
|
} catch (Exception ex) {
|
||||||
|
throw new DescriptorParseException(ex.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte[] bytesFor(String line, long times) {
|
||||||
|
return Stream.of(line).limit(times)
|
||||||
|
.collect(Collectors.joining("\n", "", "\n")).getBytes();
|
||||||
}
|
}
|
||||||
|
|
||||||
static Optional<String> sanitize(WebServerAccessLogLine logLine,
|
static Optional<String> sanitize(WebServerAccessLogLine logLine,
|
||||||
@ -186,8 +227,8 @@ public class SanitizeWeblogs extends CollecTorMain {
|
|||||||
private Stream<WebServerAccessLogLine> lineStream(LogMetadata metadata) {
|
private Stream<WebServerAccessLogLine> lineStream(LogMetadata metadata) {
|
||||||
log.debug("Processing file {}.", metadata.path);
|
log.debug("Processing file {}.", metadata.path);
|
||||||
try (BufferedReader br
|
try (BufferedReader br
|
||||||
= new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
|
= new BufferedReader(new InputStreamReader(
|
||||||
metadata.fileType.decompress(Files.readAllBytes(metadata.path)))))) {
|
metadata.fileType.decompress(Files.newInputStream(metadata.path))))) {
|
||||||
return br.lines()
|
return br.lines()
|
||||||
.map((String line) -> WebServerAccessLogLine.makeLine(line))
|
.map((String line) -> WebServerAccessLogLine.makeLine(line))
|
||||||
.collect(toList()).stream();
|
.collect(toList()).stream();
|
||||||
|
Loading…
Reference in New Issue
Block a user