mirror of https://github.com/torproject/collector.git
synced 2024-11-23 17:39:45 +00:00

commit c5c7ff2807 (parent 4e9d449805)

    Add database importer as new data sink for relay descriptors.

config | 6
@@ -55,6 +55,12 @@
 ## Import directory archives from disk, if available
 #ImportDirectoryArchives 1
 
+## Write relay descriptors to a database for later evaluation
+#WriteRelayDescriptorDatabase 0
+
+## JDBC string for relay descriptor database
+#RelayDescriptorDatabaseJDBC jdbc:postgresql:tordir?user=ernie&password=password
+
 ## Import sanitized bridges from disk, if available
 #ImportSanitizedBridges 1
 
doc/manual.pdf | BIN (new file; binary file not shown)
doc/manual.tex | 415 (new file)

@@ -0,0 +1,415 @@
\documentclass{article}
\begin{document}
\title{ERNIE: a tool to study the Tor network\\-- User's Guide --}
\author{by Karsten Loesing \texttt{<karsten@torproject.org>}}
\maketitle

\section{Overview}

Welcome to ERNIE!
ERNIE is a tool to study the Tor network.
ERNIE has been designed to process all kinds of data about the Tor network
and visualize them or prepare them for further analysis.
ERNIE is also the software behind the Tor Metrics Portal
\verb+http://metrics.torproject.org/+.

The acronym ERNIE stands for the \emph{Enhanced R-based tor Network
Intelligence Engine} (sorry for misspelling Tor).
Why ERNIE?
Because nobody liked BIRT (Business Intelligence and Reporting Tools),
which we used for visualizing statistics about the Tor network before
writing our own software.
Among other reasons, BIRT made certain people's browsers crash and
required JavaScript, which most Tor users have turned off.

If you want to learn more about the Tor network, regardless of whether you
want to present your findings on a website (like ERNIE does) or include
them in your next Tor paper, this user's guide is for you!

\section{Getting started with ERNIE}

The ERNIE project was started as a simple tool to parse Tor relay
descriptors and plot graphs on Tor network usage for a website.
Since then, ERNIE has grown into a tool that can process all kinds of Tor
network data for various purposes, including but not limited to
visualization.

We think that the easiest way to get started with ERNIE is to walk through
typical use cases in a tutorial style and explain what is required to set
up ERNIE.
These use cases have been chosen from what we think are typical
applications of ERNIE.

\subsection{Visualizing network statistics}

{\it Write me.}

\subsection{Importing relay descriptors into a database}

As of February 2010, the relays and directories in the Tor network
generate more than 1 GB of descriptors every month.
There are two approaches to processing this amount of data:
extract only the relevant data for the analysis and write them to files,
or import all data into a database and run queries on the database.
ERNIE currently takes the file-based approach for the Metrics Portal,
which works great for standardized analyses.
But the more flexible way to research the Tor network is to work with a
database.

This tutorial describes how to import relay descriptors into a database
and run a few example queries.
Note that the presented database schema is limited to answering basic
questions about the Tor network.
In order to answer more complex questions, one would have to extend the
database schema and Java classes, as sketched at the end of this
tutorial.

\subsubsection{Preparing database for data import}

The first step in importing relay descriptors into a database is to
install a database management system.
We won't go into the details of installing a database for the various
operating systems in this tutorial.
Please consult the tutorials and manuals that are available on the Web.
For this tutorial, we assume that you have PostgreSQL 8.4 installed.
Note that in theory, any other relational database that has a working JDBC
4 driver should work, too, possibly with minor modifications to ERNIE.
We further assume a database user called \verb+ernie+ that is allowed to
define, modify, and query database objects.

First, create a new database \verb+tordir+ with the two tables that we
need for importing relay descriptors, plus two indexes to accelerate
queries. Note that \verb+$+ denotes a shell prompt and \verb+tordir=>+ the
database prompt.

\begin{verbatim}
$ createdb -U ernie -O ernie tordir
$ psql -U ernie tordir
tordir=> CREATE TABLE statusentry (
           validafter TIMESTAMP NOT NULL,
           descriptor CHAR(40) NOT NULL,
           isauthority BOOLEAN NOT NULL DEFAULT false,
           isbadexit BOOLEAN NOT NULL DEFAULT false,
           isbaddirectory BOOLEAN NOT NULL DEFAULT false,
           isexit BOOLEAN NOT NULL DEFAULT false,
           isfast BOOLEAN NOT NULL DEFAULT false,
           isguard BOOLEAN NOT NULL DEFAULT false,
           ishsdir BOOLEAN NOT NULL DEFAULT false,
           isnamed BOOLEAN NOT NULL DEFAULT false,
           isstable BOOLEAN NOT NULL DEFAULT false,
           isrunning BOOLEAN NOT NULL DEFAULT false,
           isunnamed BOOLEAN NOT NULL DEFAULT false,
           isvalid BOOLEAN NOT NULL DEFAULT false,
           isv2dir BOOLEAN NOT NULL DEFAULT false,
           isv3dir BOOLEAN NOT NULL DEFAULT false,
           PRIMARY KEY (validafter, descriptor));
tordir=> CREATE TABLE descriptor (
           descriptor CHAR(40) NOT NULL PRIMARY KEY,
           address VARCHAR(15) NOT NULL,
           orport INTEGER NOT NULL,
           dirport INTEGER NOT NULL,
           bandwidthavg BIGINT NOT NULL,
           bandwidthburst BIGINT NOT NULL,
           bandwidthobserved BIGINT NOT NULL,
           platform VARCHAR(256),
           published TIMESTAMP NOT NULL,
           uptime BIGINT);
tordir=> CREATE INDEX statusvalidafter
           ON statusentry (validafter);
tordir=> CREATE INDEX descriptorid
           ON descriptor (descriptor);
tordir=> \q
\end{verbatim}

A row in the \verb+statusentry+ table contains the information that a
given relay (that has published the server descriptor with ID
\verb+descriptor+) was contained in the network status consensus published
at time \verb+validafter+.
These two fields uniquely identify a row in the \verb+statusentry+ table.
The other fields contain boolean values for the flags that the directory
authorities assigned to the relay in this consensus, e.g., the Exit flag
in \verb+isexit+.
Note that for the 24 network status consensuses of a given day, each of
them containing roughly 2000 relays, there will be about $24 \times 2000 =
48000$ rows in the \verb+statusentry+ table.

The \verb+descriptor+ table contains some portion of the information that
a relay includes in its server descriptor.
Descriptors are identified by the \verb+descriptor+ field which
corresponds to the \verb+descriptor+ field in the \verb+statusentry+
table.
The other fields contain further data from the server descriptor that
might be relevant for analyses, e.g., the platform line with the Tor
software version and operating system of the relay.

Obviously, this data schema doesn't match everyone's needs.
See the instructions below for extending ERNIE to import other data into
the database.
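
If you want to double-check that the schema was created correctly, you can
list the tables and indexes with psql's standard meta-commands:

\begin{verbatim}
$ psql -U ernie tordir
tordir=> \dt
tordir=> \di
tordir=> \q
\end{verbatim}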

\subsubsection{Downloading relay descriptors from the metrics website}

In the next step you will probably want to download relay descriptors from
the metrics website
\verb+http://metrics.torproject.org/data.html#relaydesc+.
Download the \verb+v3 consensuses+ and/or \verb+server descriptors+ of the
months you want to analyze.
The server descriptors are the documents that relays publish at least
every 18 hours describing their capabilities, whereas the v3 consensuses
are views of the directory authorities on the available relays at a given
time.
For this tutorial you need both v3 consensuses and server descriptors.
You might want to start with a single month of data, experiment with it,
and import more data later on.
Extract the tarballs to a new directory \verb+archives/+ in the ERNIE
working directory, for example as shown below.
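
For one month of data, extracting the tarballs might look like this (the
exact file names depend on the tarballs you downloaded; the names below
are only examples):

\begin{verbatim}
$ mkdir archives
$ tar -C archives -xjf consensuses-2010-02.tar.bz2
$ tar -C archives -xjf server-descriptors-2010-02.tar.bz2
\end{verbatim}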

\subsubsection{Configuring ERNIE to import relay descriptors into a
database}

ERNIE can be used to read data from one or more data sources and write
them to one or more data sinks.
You need to configure ERNIE so that it knows to use the downloaded relay
descriptors as a data source and the database as a data sink.
You have implicitly accomplished the former by creating the
\verb+archives/+ directory.
By default, ERNIE looks for this directory and tries to import everything
contained in it.
You could change this behavior by explicitly telling ERNIE not to import
data from the \verb+archives/+ directory by adding a line
\verb+ImportDirectoryArchives 0+ to the config file, but this is not what
we want in this tutorial.
You do, however, need to explicitly enable your database as a data sink.
Add the following line to your \verb+config+ file:

\begin{verbatim}
WriteRelayDescriptorDatabase 1
\end{verbatim}

You further need to provide the JDBC string that ERNIE shall use to access
the database \verb+tordir+ that we created above.
The config option with the JDBC string for a local PostgreSQL database
might be (without line break):

\begin{verbatim}
RelayDescriptorDatabaseJDBC
jdbc:postgresql:tordir?user=ernie&password=password
\end{verbatim}
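
Before running ERNIE, it can save time to verify that the user name and
password in the JDBC string actually allow connecting to the database,
e.g., by forcing a password prompt with psql's \verb+-W+ option and
running a trivial query:

\begin{verbatim}
$ psql -U ernie -W tordir -c "SELECT 1;"
\end{verbatim}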

\subsubsection{Importing relay descriptors using ERNIE}

Now you are ready to actually import relay descriptors using ERNIE.
Compile the Java classes and run ERNIE:

\begin{verbatim}
$ ./download.sh
$ ./run.sh
\end{verbatim}

Note that the import process might take between a few minutes and an hour,
depending on your hardware.
You will notice that ERNIE doesn't print progress messages to the standard
output.
You can change this behavior by setting
\verb+java.util.logging.ConsoleHandler.level+ in
\verb+logging.properties+ to \verb+INFO+ or \verb+FINE+.
Alternatively, you can look at the log file \verb+log.0+ that is created
by ERNIE.

If ERNIE finishes after a few seconds, you have probably put the relay
descriptors in the wrong place.
Make sure that you extract the relay descriptors to subdirectories of
\verb+archives/+ in the ERNIE working directory.

If you interrupt ERNIE, or if ERNIE terminates uncleanly for some reason,
you will have problems starting it the next time.
ERNIE uses a local lock file called \verb+lock+ to make sure that only a
single instance of ERNIE is running at a time.
If you are sure that the last ERNIE instance isn't running anymore, you
can remove the lock file and start ERNIE again.

If all goes well, you should now have the relay descriptors of one month
in your database.
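
A quick way to confirm that the import worked is to count the rows in the
two tables; the exact numbers depend on the month you imported:

\begin{verbatim}
$ psql -U ernie tordir
tordir=> SELECT COUNT(*) FROM statusentry;
tordir=> SELECT COUNT(*) FROM descriptor;
tordir=> \q
\end{verbatim}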

\subsubsection{Example queries}

In this tutorial, we want to give you a few examples of using the
database schema with the imported relay descriptors to extract some useful
statistics about the Tor network.

In the first example we want to find out how many relays have been running
on average per day and how many of these relays were exit relays.
We only need the \verb+statusentry+ table for this evaluation, because
the information we are interested in is contained in the network status
consensuses.

The SQL statement that we need for this evaluation consists of two parts:
First, we find out how many network status consensuses have been published
on any given day.
Second, we count all relays and those with the Exit flag and divide these
numbers by the number of network status consensuses per day.

\begin{verbatim}
$ psql -U ernie tordir
tordir=> SELECT DATE(validafter),
           COUNT(*) / relay_statuses_per_day.count AS avg_running,
           SUM(CASE WHEN isexit IS TRUE THEN 1 ELSE 0 END) /
             relay_statuses_per_day.count AS avg_exit
         FROM statusentry,
           (SELECT COUNT(*) AS count, DATE(validafter) AS date
            FROM (SELECT DISTINCT validafter FROM statusentry)
              distinct_consensuses
            GROUP BY DATE(validafter)) relay_statuses_per_day
         WHERE DATE(validafter) = relay_statuses_per_day.date
         GROUP BY DATE(validafter), relay_statuses_per_day.count
         ORDER BY DATE(validafter);
tordir=> \q
\end{verbatim}

Executing this query should finish within a few seconds to one minute,
again depending on your hardware.
The result might start like this (truncated here):

\begin{verbatim}
    date    | avg_running | avg_exit
------------+-------------+----------
 2010-02-01 |        1583 |      627
 2010-02-02 |        1596 |      638
 2010-02-03 |        1600 |      654
 :
\end{verbatim}

In the second example we want to find out what Tor software versions the
relays have been running.
More precisely, we want to know how many relays have been running which
Tor version, at micro version granularity (e.g., 0.2.2), on average per
day.

We need to combine network status consensuses with server descriptors to
find out this information, because the version information is not
contained in the consensuses (or at least, including it there is
optional; and after all, this is just an example).
Note that we cannot focus on server descriptors only and leave out the
consensuses for this analysis, because we want our analysis to be limited
to running relays as confirmed by the directory authorities and not
include all descriptors that happened to be published on a given day.

The SQL statement again determines the number of consensuses per day in a
subquery.
In the next step, we join the \verb+statusentry+ table with the
\verb+descriptor+ table for all rows contained in the \verb+statusentry+
table.
The left join means that we include \verb+statusentry+ rows even if we do
not have corresponding rows in the \verb+descriptor+ table.
We determine the version by skipping the first 4 characters of the platform
string that should contain \verb+"Tor "+ (without quotes) and cutting off
after another 5 characters.
Obviously, this approach is prone to errors if the platform line format
changes, but it should be sufficient for this example.

\begin{verbatim}
$ psql -U ernie tordir
tordir=> SELECT DATE(validafter) AS date,
           SUBSTRING(platform, 5, 5) AS version,
           COUNT(*) / relay_statuses_per_day.count AS count
         FROM
           (SELECT COUNT(*) AS count, DATE(validafter) AS date
            FROM (SELECT DISTINCT validafter
                  FROM statusentry) distinct_consensuses
            GROUP BY DATE(validafter)) relay_statuses_per_day
         JOIN statusentry
           ON relay_statuses_per_day.date = DATE(validafter)
         LEFT JOIN descriptor
           ON statusentry.descriptor = descriptor.descriptor
         GROUP BY DATE(validafter), SUBSTRING(platform, 5, 5),
           relay_statuses_per_day.count, relay_statuses_per_day.date
         ORDER BY DATE(validafter), SUBSTRING(platform, 5, 5);
tordir=> \q
\end{verbatim}

Running this query takes longer than the first one, from a few minutes up
to half an hour.
The main reason is that joining the two tables is an expensive database
operation.
If you plan to perform many evaluations like this one, you might want to
create a third table that holds the results of joining the two tables of
this tutorial.
Creating such a table to speed up queries is not specific to ERNIE and
beyond the scope of this tutorial, but a rough sketch follows below.
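
As a starting point, such a pre-joined table could be created like this
(the table name \verb+statusentry_descriptor+ is made up for this sketch,
and you would have to re-create or update the table whenever you import
new descriptors):

\begin{verbatim}
tordir=> CREATE TABLE statusentry_descriptor AS
           SELECT statusentry.*, descriptor.platform,
             descriptor.published, descriptor.uptime
           FROM statusentry
           LEFT JOIN descriptor
           ON statusentry.descriptor = descriptor.descriptor;
\end{verbatim}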

The (truncated) result of the version query might look like this:

\begin{verbatim}
    date    | version | count
------------+---------+-------
 2010-02-01 | 0.1.2   |    10
 2010-02-01 | 0.2.0   |   217
 2010-02-01 | 0.2.1   |   774
 2010-02-01 | 0.2.2   |    75
 2010-02-01 |         |   505
 2010-02-02 | 0.1.2   |    14
 2010-02-02 | 0.2.0   |   328
 2010-02-02 | 0.2.1   |  1143
 2010-02-02 | 0.2.2   |   110
 :
\end{verbatim}

Note that, in the fifth line, we are missing the server descriptors of 505
relays contained in network status consensuses published on 2010-02-01.
If you want to avoid such missing values, you'll have to import the server
descriptors of the previous month, too.
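
To see how many status entries lack a matching server descriptor on a
given day, a query along these lines should work:

\begin{verbatim}
tordir=> SELECT DATE(validafter) AS date, COUNT(*)
         FROM statusentry
         LEFT JOIN descriptor
           ON statusentry.descriptor = descriptor.descriptor
         WHERE descriptor.descriptor IS NULL
         GROUP BY DATE(validafter)
         ORDER BY DATE(validafter);
\end{verbatim}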

\subsubsection{Extending ERNIE to import further data into the database}

In this tutorial we have explained how to prepare a database, download
relay descriptors, configure ERNIE, import the descriptors, and execute
example queries.
This description is limited to a few examples by the very nature of a
tutorial.
If you want to extend ERNIE to import further data into your database,
you will have to perform at least two steps:
extend the database schema and modify the Java classes used for parsing.

The first step, extending the database schema, is not specific to ERNIE.
Just add the fields and tables to the schema definition, as sketched
below.
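
For example, if you wanted to store a relay's contact information, a
hypothetical extension might add a column to the \verb+descriptor+ table
(the column name and size are made up for this sketch):

\begin{verbatim}
tordir=> ALTER TABLE descriptor ADD COLUMN contactinfo VARCHAR(1024);
\end{verbatim}

Of course, such a new column only gets filled if the Java side writes to
it, which leads to the second step.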

The second step, modifying the Java classes used for parsing, is of course
specific to ERNIE.
You will have to look at two classes in particular:
the first class, \verb+RelayDescriptorDatabaseImporter+, contains the
prepared statements and methods used to add network status consensus
entries and server descriptors to the database.
The second class, \verb+RelayDescriptorParser+, contains the parsing logic
for the relay descriptors and decides what information to add to the
database, among other things.

This ends the tutorial on importing relay descriptors into a database.
Happy researching!

\subsection{Aggregating relay and bridge descriptors}

{\it Write me.}

\section{Software architecture}

{\it Write me. In particular, include overview of components:

\begin{itemize}
\item Data sources and data sinks
\item Java classes with data sources and data sinks
\item R scripts to process CSV output
\item Website
\end{itemize}
}

\section{Tor Metrics Portal setup}

{\it
Write me. In particular, include documentation of the deployed ERNIE that
runs the metrics website.
This documentation has two purposes:
First, a reference setup can help others create their own ERNIE
configuration that goes beyond the use cases described above.
Second, we need to remember how things are configured anyway, so we might
as well document them here.}

\end{document}

lib/postgresql-8.4-701.jdbc4.jar | BIN (new file; binary file not shown)
run.sh | 2

@@ -1,3 +1,3 @@
 #!/bin/sh
-java -Xms128m -Xmx1024m -cp bin/:lib/commons-codec-1.4.jar:lib/commons-compress-1.0.jar -Djava.util.logging.config.file=logging.properties Main
+java -Xms128m -Xmx1024m -cp bin/:lib/commons-codec-1.4.jar:lib/commons-compress-1.0.jar:lib/postgresql-8.4-701.jdbc4.jar -Djava.util.logging.config.file=logging.properties Main

src/Configuration.java

@@ -25,6 +25,9 @@ public class Configuration {
   private boolean writeDirectoryArchives = false;
   private boolean importCachedRelayDescriptors = true;
   private boolean importDirectoryArchives = true;
+  private boolean writeRelayDescriptorDatabase = false;
+  private String relayDescriptorDatabaseJdbc = "jdbc:postgresql:tordir?"
+      + "user=ernie&password=password";
   private boolean importSanitizedBridges = true;
   private boolean importBridgeSnapshots = true;
   private boolean importWriteTorperfStats = true;
@@ -90,6 +93,11 @@ public class Configuration {
     } else if (line.startsWith("ImportDirectoryArchives")) {
       this.importDirectoryArchives = Integer.parseInt(
           line.split(" ")[1]) != 0;
+    } else if (line.startsWith("WriteRelayDescriptorDatabase")) {
+      this.writeRelayDescriptorDatabase = Integer.parseInt(
+          line.split(" ")[1]) != 0;
+    } else if (line.startsWith("RelayDescriptorDatabaseJDBC")) {
+      this.relayDescriptorDatabaseJdbc = line.split(" ")[1];
     } else if (line.startsWith("ImportSanitizedBridges")) {
       this.importSanitizedBridges = Integer.parseInt(
           line.split(" ")[1]) != 0;
@@ -195,6 +203,12 @@ public class Configuration {
   public boolean getImportDirectoryArchives() {
     return this.importDirectoryArchives;
   }
+  public boolean getWriteRelayDescriptorDatabase() {
+    return this.writeRelayDescriptorDatabase;
+  }
+  public String getRelayDescriptorDatabaseJDBC() {
+    return this.relayDescriptorDatabaseJdbc;
+  }
   public boolean getImportSanitizedBridges() {
     return this.importSanitizedBridges;
   }

src/Main.java

@@ -42,14 +42,21 @@ public class Main {
     ArchiveWriter aw = config.getWriteDirectoryArchives() ?
         new ArchiveWriter() : null;
 
+    // Prepare writing relay descriptors to database
+    RelayDescriptorDatabaseImporter rddi =
+        config.getWriteRelayDescriptorDatabase() ?
+        new RelayDescriptorDatabaseImporter(
+        config.getRelayDescriptorDatabaseJDBC()) : null;
+
     // Prepare relay descriptor parser (only if we are writing stats or
     // directory archives to disk)
     RelayDescriptorParser rdp = config.getWriteConsensusStats() ||
         config.getWriteBridgeStats() || config.getWriteDirreqStats() ||
         config.getWriteServerDescriptorStats() ||
-        config.getWriteDirectoryArchives() ?
-        new RelayDescriptorParser(csfh, bsfh, dsfh, sdsfh, aw, countries,
-        directories) : null;
+        config.getWriteDirectoryArchives() ||
+        config.getWriteRelayDescriptorDatabase() ?
+        new RelayDescriptorParser(csfh, bsfh, dsfh, sdsfh, aw, rddi,
+        countries, directories) : null;
 
     // Import/download relay descriptors from the various sources
     if (rdp != null) {
@@ -58,9 +65,10 @@ public class Main {
       List<String> dirSources =
           config.getDownloadFromDirectoryAuthorities();
       boolean downloadCurrentConsensus = aw != null || csfh != null ||
-          bsfh != null || sdsfh != null;
+          bsfh != null || sdsfh != null || rddi != null;
       boolean downloadCurrentVotes = aw != null;
-      boolean downloadAllServerDescriptors = aw != null || sdsfh != null;
+      boolean downloadAllServerDescriptors = aw != null ||
+          sdsfh != null || rddi != null;
       boolean downloadAllExtraInfos = aw != null;
       Set<String> downloadDescriptorsForRelays = directories;
       rdd = new RelayDescriptorDownloader(rdp, dirSources,

src/RelayDescriptorDatabaseImporter.java | 168 (new file)

@@ -0,0 +1,168 @@
import java.sql.*;
import java.util.*;
import java.util.logging.*;

/**
 * Parse directory data.
 */
public final class RelayDescriptorDatabaseImporter {

  /**
   * Relay descriptor database connection.
   */
  private Connection conn;

  /**
   * Prepared statement to check whether a given network status consensus
   * entry has been imported into the database before.
   */
  private PreparedStatement psRs;

  /**
   * Prepared statement to check whether a given server descriptor has
   * been imported into the database before.
   */
  private PreparedStatement psDs;

  /**
   * Prepared statement to insert a network status consensus entry into
   * the database.
   */
  private PreparedStatement psR;

  /**
   * Prepared statement to insert a server descriptor into the database.
   */
  private PreparedStatement psD;

  /**
   * Logger for this class.
   */
  private Logger logger;

  /**
   * Initialize database importer by connecting to the database and
   * preparing statements.
   */
  public RelayDescriptorDatabaseImporter(String connectionURL) {

    /* Initialize logger. */
    this.logger = Logger.getLogger(
        RelayDescriptorDatabaseImporter.class.getName());

    try {
      /* Connect to database. */
      this.conn = DriverManager.getConnection(connectionURL);

      /* Prepare statements. */
      this.psRs = conn.prepareStatement("SELECT COUNT(*) "
          + "FROM statusentry WHERE validafter = ? AND descriptor = ?");
      this.psDs = conn.prepareStatement("SELECT COUNT(*) "
          + "FROM descriptor WHERE descriptor = ?");
      this.psR = conn.prepareStatement("INSERT INTO statusentry "
          + "(validafter, descriptor, isauthority, isbadexit, "
          + "isbaddirectory, isexit, isfast, isguard, ishsdir, isnamed, "
          + "isstable, isrunning, isunnamed, isvalid, isv2dir, isv3dir) "
          + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
      this.psD = conn.prepareStatement("INSERT INTO descriptor "
          + "(descriptor, address, orport, dirport, bandwidthavg, "
          + "bandwidthburst, bandwidthobserved, platform, published, "
          + "uptime) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");

    } catch (SQLException e) {
      this.logger.log(Level.WARNING, "Could not connect to database or "
          + "prepare statements.", e);
    }
  }

  /**
   * Insert network status consensus entry into database.
   */
  public void addStatusEntry(long validAfter, String descriptor,
      SortedSet<String> flags) {
    if (this.psRs == null || this.psR == null) {
      return;
    }
    try {
      Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
      Timestamp validAfterTimestamp = new Timestamp(validAfter);
      this.psRs.setTimestamp(1, validAfterTimestamp, cal);
      this.psRs.setString(2, descriptor);
      ResultSet rs = psRs.executeQuery();
      rs.next();
      if (rs.getInt(1) > 0) {
        return;
      }
      this.psR.clearParameters();
      this.psR.setTimestamp(1, validAfterTimestamp, cal);
      this.psR.setString(2, descriptor);
      this.psR.setBoolean(3, flags.contains("Authority"));
      this.psR.setBoolean(4, flags.contains("BadExit"));
      this.psR.setBoolean(5, flags.contains("BadDirectory"));
      this.psR.setBoolean(6, flags.contains("Exit"));
      this.psR.setBoolean(7, flags.contains("Fast"));
      this.psR.setBoolean(8, flags.contains("Guard"));
      this.psR.setBoolean(9, flags.contains("HSDir"));
      this.psR.setBoolean(10, flags.contains("Named"));
      this.psR.setBoolean(11, flags.contains("Stable"));
      this.psR.setBoolean(12, flags.contains("Running"));
      this.psR.setBoolean(13, flags.contains("Unnamed"));
      this.psR.setBoolean(14, flags.contains("Valid"));
      this.psR.setBoolean(15, flags.contains("V2Dir"));
      this.psR.setBoolean(16, flags.contains("V3Dir"));
      this.psR.execute();
    } catch (SQLException e) {
      this.logger.log(Level.WARNING, "Could not add network status "
          + "consensus entry.", e);
    }
  }

  /**
   * Insert server descriptor into database.
   */
  public void addServerDescriptor(String descriptor, String address,
      int orPort, int dirPort, long bandwidthAvg, long bandwidthBurst,
      long bandwidthObserved, String platform, long published,
      long uptime) {
    if (this.psDs == null || this.psD == null) {
      return;
    }
    try {
      Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
      this.psDs.setString(1, descriptor);
      ResultSet rs = psDs.executeQuery();
      rs.next();
      if (rs.getInt(1) > 0) {
        return;
      }
      this.psD.clearParameters();
      this.psD.setString(1, descriptor);
      this.psD.setString(2, address);
      this.psD.setInt(3, orPort);
      this.psD.setInt(4, dirPort);
      this.psD.setLong(5, bandwidthAvg);
      this.psD.setLong(6, bandwidthBurst);
      this.psD.setLong(7, bandwidthObserved);
      this.psD.setString(8, platform);
      this.psD.setTimestamp(9, new Timestamp(published), cal);
      this.psD.setLong(10, uptime);
      this.psD.execute();
    } catch (SQLException e) {
      this.logger.log(Level.WARNING, "Could not add server descriptor.",
          e);
    }
  }

  /**
   * Close the relay descriptor database connection.
   */
  public void closeConnection() {
    try {
      this.conn.close();
    } catch (SQLException e) {
      this.logger.log(Level.WARNING, "Could not close database "
          + "connection.", e);
    }
  }
}

src/RelayDescriptorParser.java

@@ -48,6 +48,12 @@ public class RelayDescriptorParser {
    */
   private RelayDescriptorDownloader rdd;
 
+  /**
+   * Relay descriptor database importer that stores relay descriptor
+   * contents for later evaluation.
+   */
+  private RelayDescriptorDatabaseImporter rddi;
+
   /**
    * Countries that we care about for directory request and bridge
    * statistics.
@@ -70,14 +76,19 @@ public class RelayDescriptorParser {
   public RelayDescriptorParser(ConsensusStatsFileHandler csfh,
       BridgeStatsFileHandler bsfh, DirreqStatsFileHandler dsfh,
       ServerDescriptorStatsFileHandler sdsfh, ArchiveWriter aw,
-      SortedSet<String> countries, SortedSet<String> directories) {
+      RelayDescriptorDatabaseImporter rddi, SortedSet<String> countries,
+      SortedSet<String> directories) {
     this.csfh = csfh;
     this.bsfh = bsfh;
     this.dsfh = dsfh;
     this.sdsfh = sdsfh;
     this.aw = aw;
+    this.rddi = rddi;
     this.countries = countries;
     this.directories = directories;
 
     /* Initialize logger. */
     this.logger = Logger.getLogger(RelayDescriptorParser.class.getName());
   }
 
   public void setRelayDescriptorDownloader(
@@ -107,7 +118,8 @@ public class RelayDescriptorParser {
       // consensuses
       boolean isConsensus = true;
       int exit = 0, fast = 0, guard = 0, running = 0, stable = 0;
-      String validAfterTime = null, descriptorIdentity = null;
+      String validAfterTime = null, descriptorIdentity = null,
+          serverDesc = null;
       StringBuilder descriptorIdentities = new StringBuilder();
       String fingerprint = null;
       long validAfter = -1L;
@@ -130,7 +142,7 @@ public class RelayDescriptorParser {
           String relayIdentity = Hex.encodeHexString(
               Base64.decodeBase64(line.split(" ")[2] + "=")).
               toLowerCase();
-          String serverDesc = Hex.encodeHexString(Base64.decodeBase64(
+          serverDesc = Hex.encodeHexString(Base64.decodeBase64(
              line.split(" ")[3] + "=")).toLowerCase();
           serverDescriptors.add(publishedTime + "," + relayIdentity
               + "," + serverDesc);
@@ -146,6 +158,15 @@ public class RelayDescriptorParser {
             running++;
             descriptorIdentities.append("," + descriptorIdentity);
           }
+          if (this.rddi != null) {
+            SortedSet<String> flags = new TreeSet<String>();
+            if (line.length() > 2) {
+              for (String flag : line.substring(2).split(" ")) {
+                flags.add(flag);
+              }
+            }
+            this.rddi.addStatusEntry(validAfter, serverDesc, flags);
+          }
         }
       }
       if (isConsensus) {
@@ -194,7 +215,11 @@ public class RelayDescriptorParser {
         String platformLine = null, publishedLine = null,
             publishedTime = null, bandwidthLine = null,
             extraInfoDigest = null, relayIdentifier = null;
-        long published = -1L;
+        String[] parts = line.split(" ");
+        String address = parts[2];
+        int orPort = Integer.parseInt(parts[3]);
+        int dirPort = Integer.parseInt(parts[4]);
+        long published = -1L, uptime = -1L;
         while ((line = br.readLine()) != null) {
           if (line.startsWith("platform ")) {
             platformLine = line;
@@ -214,6 +239,8 @@ public class RelayDescriptorParser {
             extraInfoDigest = line.startsWith("opt ") ?
                 line.split(" ")[2].toLowerCase() :
                 line.split(" ")[1].toLowerCase();
+          } else if (line.startsWith("uptime ")) {
+            uptime = Long.parseLong(line.substring("uptime ".length()));
           }
         }
         String ascii = new String(data, "US-ASCII");
@@ -240,6 +267,16 @@ public class RelayDescriptorParser {
           this.sdsfh.addServerDescriptor(descriptorIdentity, platformLine,
               publishedLine, bandwidthLine);
         }
+        if (this.rddi != null && digest != null) {
+          String[] bwParts = bandwidthLine.split(" ");
+          long bandwidthAvg = Long.parseLong(bwParts[1]);
+          long bandwidthBurst = Long.parseLong(bwParts[2]);
+          long bandwidthObserved = Long.parseLong(bwParts[3]);
+          String platform = platformLine.substring("platform ".length());
+          this.rddi.addServerDescriptor(digest, address, orPort, dirPort,
+              bandwidthAvg, bandwidthBurst, bandwidthObserved, platform,
+              published, uptime);
+        }
       } else if (line.startsWith("extra-info ")) {
         String publishedTime = null, relayIdentifier = line.split(" ")[2];
         long published = -1L;