Fix encoding problem when parsing multi-descriptor files.

When we parse a descriptor file that may contain more than one
descriptor, we convert the file contents to a String so that we can
search for descriptor beginnings using String methods.  But we don't
pass a character encoding, so Java falls back to the platform's default
charset, which varies between systems.  We should instead decode the
bytes as "US-ASCII", which is platform-independent and sufficient to
find the keywords that mark the beginning of a new descriptor.

Fixes #11821.
Karsten Loesing 2014-05-25 12:32:36 +02:00
parent 38c48ddd0c
commit b298cbcbd1
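
To illustrate the point, here is a minimal, self-contained sketch of the
splitting approach described above.  It is not the actual metrics-lib code:
the class name, method name, and sample descriptors are made up, and it uses
the Java 7 StandardCharsets constant rather than the charset-name overload
used in this commit.  The property relied on is that US-ASCII decodes every
byte to exactly one char, so indexes found with String methods are also
valid offsets into the raw byte array.

    // Illustrative sketch only, not the metrics-lib implementation.
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;

    public class SplitSketch {

      /* Splits raw bytes into one byte[] per descriptor, where each
       * descriptor starts with startToken at the beginning of a line. */
      static List<byte[]> splitByStartToken(byte[] raw, String startToken) {
        List<byte[]> result = new ArrayList<byte[]>();
        /* Decode with an explicit, ASCII-compatible charset instead of the
         * platform default; one byte always becomes exactly one char. */
        String ascii = new String(raw, StandardCharsets.US_ASCII);
        String splitToken = "\n" + startToken;
        int start;
        if (ascii.startsWith(startToken)) {
          start = 0;
        } else {
          int first = ascii.indexOf(splitToken);
          start = first < 0 ? -1 : first + 1;
        }
        while (start >= 0) {
          int next = ascii.indexOf(splitToken, start);
          int end = next < 0 ? raw.length : next + 1;
          byte[] descriptor = new byte[end - start];
          System.arraycopy(raw, start, descriptor, 0, descriptor.length);
          result.add(descriptor);
          start = next < 0 ? -1 : end;
        }
        return result;
      }

      public static void main(String[] args) {
        byte[] raw = ("router first 127.0.0.1 9001 0 0\nrest of first\n"
            + "router second 127.0.0.2 9001 0 0\nrest of second\n")
            .getBytes(StandardCharsets.US_ASCII);
        for (byte[] descriptor : splitByStartToken(raw, "router ")) {
          System.out.println(descriptor.length + "-byte descriptor");
        }
      }
    }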

@@ -2,6 +2,7 @@
  * See LICENSE for licensing information */
 package org.torproject.descriptor.impl;
+import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -107,7 +108,12 @@ public abstract class DescriptorImpl implements Descriptor {
       byte[] rawDescriptorBytes, String startToken) {
     List<byte[]> rawDescriptors = new ArrayList<byte[]>();
     String splitToken = "\n" + startToken;
-    String ascii = new String(rawDescriptorBytes);
+    String ascii;
+    try {
+      ascii = new String(rawDescriptorBytes, "US-ASCII");
+    } catch (UnsupportedEncodingException e) {
+      return rawDescriptors;
+    }
     int endAllDescriptors = rawDescriptorBytes.length,
         startAnnotations = 0;
     boolean containsAnnotations = ascii.startsWith("@") ||
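
Side note, not part of this commit: on Java 7 and later the checked
UnsupportedEncodingException can be avoided entirely by passing a Charset
constant instead of a charset name, which would make the try/catch block
unnecessary.  A sketch of that variant:

    import java.nio.charset.StandardCharsets;

    // String(byte[], Charset) replaces malformed input instead of throwing,
    // so no exception handling is needed here.
    String ascii = new String(rawDescriptorBytes, StandardCharsets.US_ASCII);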