Auto merge of #219 - emilio:macros, r=fitzgen

Parse macro expressions. Clang is trolling me really hard so I'm going to see if the extra token I'm always getting is LLVM 3.9 specific.
2025-03-04 12:47:22 +00:00 · 2016-11-08 19:35:27 -06:00 · 2016-11-08 19:35:27 -06:00 · 8d83bdfb48
commit 8d83bdfb48
parent c9eccea095 4ee31ce84e
11 changed files with 264 additions and 83 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -31,6 +31,7 @@ env_logger = "0.3"
 rustc-serialize = "0.3.19"
 syntex_syntax = "0.44"
 regex = "0.1"
+cexpr = "0.2"

 [dependencies.aster]
 features = ["with-syntex"]
--- a/src/chooser.rs
+++ b/src/chooser.rs
@ -0,0 +1,14 @@
+//! A public API for more fine-grained customization of bindgen behavior.
+
+pub use ir::int::IntKind;
+use std::fmt;
+
+/// A trait to allow configuring different kinds of types in different
+/// situations.
+pub trait TypeChooser: fmt::Debug {
+    /// The integer kind an integer macro should have, given a name and the
+    /// value of that macro, or `None` if you want the default to be chosen.
+    fn int_macro(&self, _name: &str, _value: i64) -> Option<IntKind> {
+        None
+    }
+}
--- a/src/clang.rs
+++ b/src/clang.rs
@ -4,8 +4,9 @@
 #![allow(non_upper_case_globals, dead_code)]


+use cexpr;
 use clangll::*;
-use std::{mem, ptr};
+use std::{mem, ptr, slice};
 use std::ffi::{CStr, CString};
 use std::fmt;
 use std::hash::Hash;
@ -1050,18 +1051,18 @@ impl TranslationUnit {
        let range = cursor.extent();
        let mut tokens = vec![];
        unsafe {
-            let mut token_ptr = ::std::ptr::null_mut();
+            let mut token_ptr = ptr::null_mut();
            let mut num_tokens: c_uint = 0;
            clang_tokenize(self.x, range, &mut token_ptr, &mut num_tokens);
            if token_ptr.is_null() {
                return None;
            }
-            let token_array = ::std::slice::from_raw_parts(token_ptr,
-                                                           num_tokens as usize);
+
+            let token_array = slice::from_raw_parts(token_ptr,
+                                                    num_tokens as usize);
            for &token in token_array.iter() {
                let kind = clang_getTokenKind(token);
-                let spelling: String = clang_getTokenSpelling(self.x, token)
-                    .into();
+                let spelling = clang_getTokenSpelling(self.x, token).into();

                tokens.push(Token {
                    kind: kind,
@ -1072,6 +1073,62 @@ impl TranslationUnit {
        }
        Some(tokens)
    }
+
+    /// Convert a set of tokens from clang into `cexpr` tokens, for further
+    /// processing.
+    pub fn cexpr_tokens(&self,
+                        cursor: &Cursor)
+                        -> Option<Vec<cexpr::token::Token>> {
+        use cexpr::token;
+
+        let mut tokens = match self.tokens(cursor) {
+            Some(tokens) => tokens,
+            None => return None,
+        };
+
+        // FIXME(emilio): LLVM 3.9, at least, always includes an extra token for
+        // no good reason (except if we're at EOF). So we do this kind of hack,
+        // where we skip trailing punctuation and trailing keywords that are
+        // known to cause problems.
+        //
+        // This is sort of unfortunate, though :(.
+        //
+        // I'll try to get it fixed in LLVM if I have the time to submit a
+        // patch.
+        let mut trim_last_token = false;
+        if let Some(token) = tokens.last() {
+            // The start of the next macro.
+            trim_last_token |= token.spelling == "#" &&
+                               token.kind == CXToken_Punctuation;
+
+            // A following keyword of any kind, like a following declaration.
+            trim_last_token |= token.kind == CXToken_Keyword;
+        }
+
+        if trim_last_token {
+            tokens.pop().unwrap();
+        }
+
+        Some(tokens.into_iter()
+            .filter_map(|token| {
+                let kind = match token.kind {
+                    CXToken_Punctuation => token::Kind::Punctuation,
+                    CXToken_Literal => token::Kind::Literal,
+                    CXToken_Identifier => token::Kind::Identifier,
+                    CXToken_Keyword => token::Kind::Keyword,
+                    // NB: cexpr is not too happy about comments inside
+                    // expressions, so we strip them down here.
+                    CXToken_Comment => return None,
+                    _ => panic!("Found unexpected token kind: {}", token.kind),
+                };
+
+                Some(token::Token {
+                    kind: kind,
+                    raw: token.spelling.into_bytes().into_boxed_slice(),
+                })
+            })
+            .collect::<Vec<_>>())
+    }
 }

 impl Drop for TranslationUnit {
--- a/src/codegen/mod.rs
+++ b/src/codegen/mod.rs
@ -1587,8 +1587,19 @@ impl ToRustTy for Type {
                    IntKind::ULong => raw!(c_ulong),
                    IntKind::LongLong => raw!(c_longlong),
                    IntKind::ULongLong => raw!(c_ulonglong),
+
+                    IntKind::I8 => aster::ty::TyBuilder::new().i8(),
+                    IntKind::U8 => aster::ty::TyBuilder::new().u8(),
+                    IntKind::I16 => aster::ty::TyBuilder::new().i16(),
                    IntKind::U16 => aster::ty::TyBuilder::new().u16(),
+                    IntKind::I32 => aster::ty::TyBuilder::new().i32(),
                    IntKind::U32 => aster::ty::TyBuilder::new().u32(),
+                    IntKind::I64 => aster::ty::TyBuilder::new().i64(),
+                    IntKind::U64 => aster::ty::TyBuilder::new().u64(),
+                    IntKind::Custom { name, .. } => {
+                        let ident = ctx.rust_ident_raw(name);
+                        quote_ty!(ctx.ext_cx(), $ident)
+                    }
                    // FIXME: This doesn't generate the proper alignment, but we
                    // can't do better right now. We should be able to use
                    // i128/u128 when they're available.
--- a/src/ir/context.rs
+++ b/src/ir/context.rs
@ -1,10 +1,11 @@
 //! Common context that is passed around during parsing and codegen.

 use BindgenOptions;
+use cexpr;
 use clang::{self, Cursor};
 use parse::ClangItemParser;
-use std::borrow::{Borrow, Cow};
-use std::collections::{HashMap, HashSet, hash_map};
+use std::borrow::Cow;
+use std::collections::{HashMap, hash_map};
 use std::collections::btree_map::{self, BTreeMap};
 use std::fmt;
 use super::int::IntKind;
@ -77,8 +78,9 @@ pub struct BindgenContext<'ctx> {
    pub currently_parsed_types: Vec<(Cursor, ItemId)>,

    /// A HashSet with all the already parsed macro names. This is done to avoid
-    /// hard errors while parsing duplicated macros.
-    parsed_macros: HashSet<String>,
+    /// hard errors while parsing duplicated macros, as well as to allow
+    /// macro expression parsing.
+    parsed_macros: HashMap<Vec<u8>, cexpr::expr::EvalResult>,

    /// The active replacements collected from replaces="xxx" annotations.
    replacements: HashMap<String, ItemId>,
@ -243,7 +245,7 @@ impl<'ctx> BindgenContext<'ctx> {

    /// Returns a mangled name as a rust identifier.
    pub fn rust_ident_raw(&self, name: &str) -> Ident {
-        self.ext_cx().ident_of(name.borrow())
+        self.ext_cx().ident_of(name)
    }

    /// Iterate over all items that have been defined.
@ -715,14 +717,21 @@ impl<'ctx> BindgenContext<'ctx> {
    }

    /// Have we parsed the macro named `macro_name` already?
-    pub fn parsed_macro(&self, macro_name: &str) -> bool {
-        self.parsed_macros.contains(macro_name)
+    pub fn parsed_macro(&self, macro_name: &[u8]) -> bool {
+        self.parsed_macros.contains_key(macro_name)
+    }
+
+    /// Get the currently parsed macros.
+    pub fn parsed_macros(&self) -> &HashMap<Vec<u8>, cexpr::expr::EvalResult> {
+        debug_assert!(!self.in_codegen_phase());
+        &self.parsed_macros
    }

    /// Mark the macro named `macro_name` as parsed.
-    pub fn note_parsed_macro(&mut self, macro_name: String) {
-        debug_assert!(!self.parsed_macros.contains(&macro_name));
-        self.parsed_macros.insert(macro_name);
+    pub fn note_parsed_macro(&mut self,
+                             id: Vec<u8>,
+                             value: cexpr::expr::EvalResult) {
+        self.parsed_macros.insert(id, value);
    }

    /// Are we in the codegen phase?
--- a/src/ir/int.rs
+++ b/src/ir/int.rs
@ -36,18 +36,44 @@ pub enum IntKind {
    /// An `unsigned long long`.
    ULongLong,

+    /// An 8-bit signed integer.
+    I8,
+
+    /// An 8-bit unsigned integer.
+    U8,
+
+    /// A 16-bit signed integer.
+    I16,
+
    /// Either a `char16_t` or a `wchar_t`.
    U16,

-    /// A `char32_t`.
+    /// A 32-bit signed integer.
+    I32,
+
+    /// A 32-bit unsigned integer.
    U32,

+    /// A 64-bit signed integer.
+    I64,
+
+    /// A 64-bit unsigned integer.
+    U64,
+
    /// An `int128_t`
    I128,

    /// A `uint128_t`.
-    U128, /* Though now we're at it we could add equivalents for the rust
-           * types... */
+    U128,
+
+    /// A custom integer type, used to allow custom macro types depending on
+    /// range.
+    Custom {
+        /// The name of the type, which would be used without modification.
+        name: &'static str,
+        /// Whether the type is signed or not.
+        is_signed: bool,
+    },
 }

 impl IntKind {
@ -55,10 +81,13 @@ impl IntKind {
    pub fn is_signed(&self) -> bool {
        use self::IntKind::*;
        match *self {
-            Bool | UChar | UShort | UInt | ULong | ULongLong | U16 | U32 |
-            U128 => false,
+            Bool | UChar | UShort | UInt | ULong | ULongLong | U8 | U16 |
+            U32 | U64 | U128 => false,

-            Char | Short | Int | Long | LongLong | I128 => true,
+            Char | Short | Int | Long | LongLong | I8 | I16 | I32 | I64 |
+            I128 => true,
+
+            Custom { is_signed, .. } => is_signed,
        }
    }
 }
--- a/src/ir/var.rs
+++ b/src/ir/var.rs
@ -1,7 +1,9 @@
 //! Intermediate representation of variables.

+use cexpr;
 use clang;
 use parse::{ClangItemParser, ClangSubItemParser, ParseError, ParseResult};
+use std::num::Wrapping;
 use super::context::BindgenContext;
 use super::function::cursor_mangling;
 use super::int::IntKind;
@ -73,43 +75,65 @@ impl ClangSubItemParser for Var {
             ctx: &mut BindgenContext)
             -> Result<ParseResult<Self>, ParseError> {
        use clangll::*;
+        use cexpr::expr::EvalResult;
        match cursor.kind() {
            CXCursor_MacroDefinition => {
-                let value = parse_int_literal_tokens(&cursor,
-                                                     ctx.translation_unit());
+                let value = parse_macro(ctx, &cursor, ctx.translation_unit());

-                let value = match value {
+                let (id, value) = match value {
                    Some(v) => v,
                    None => return Err(ParseError::Continue),
                };

-                let name = cursor.spelling();
-                if name.is_empty() {
-                    warn!("Empty macro name?");
-                    return Err(ParseError::Continue);
-                }
+                assert!(!id.is_empty(), "Empty macro name?");

-                if ctx.parsed_macro(&name) {
+                if ctx.parsed_macro(&id) {
+                    let name = String::from_utf8(id).unwrap();
                    warn!("Duplicated macro definition: {}", name);
                    return Err(ParseError::Continue);
                }
-                ctx.note_parsed_macro(name.clone());

-                let ty = if value < 0 {
-                    Item::builtin_type(TypeKind::Int(IntKind::Int), true, ctx)
-                } else if value.abs() > u32::max_value() as i64 {
-                    Item::builtin_type(TypeKind::Int(IntKind::ULongLong),
-                                       true,
-                                       ctx)
-                } else {
-                    Item::builtin_type(TypeKind::Int(IntKind::UInt), true, ctx)
+                // NB: It's important to "note" the macro even if the result is
+                // not an integer, otherwise we might lose other kinds of
+                // derived macros.
+                ctx.note_parsed_macro(id.clone(), value.clone());
+
+                // NOTE: Unwrapping, here and above, is safe, because the
+                // identifier of a token comes straight from clang, and we
+                // enforce utf8 there, so we should have already panicked at
+                // this point.
+                let name = String::from_utf8(id).unwrap();
+                let (int_kind, val) = match value {
+                    // TODO(emilio): Handle the non-invalid ones!
+                    EvalResult::Float(..) |
+                    EvalResult::Char(..) |
+                    EvalResult::Str(..) |
+                    EvalResult::Invalid => return Err(ParseError::Continue),
+
+                    EvalResult::Int(Wrapping(value)) => {
+                        let kind = ctx.options().type_chooser.as_ref()
+                            .and_then(|c| c.int_macro(&name, value))
+                            .unwrap_or_else(|| {
+                                if value < 0 {
+                                    if value < i32::min_value() as i64 {
+                                        IntKind::LongLong
+                                    } else {
+                                        IntKind::Int
+                                    }
+                                } else if value > u32::max_value() as i64 {
+                                    IntKind::ULongLong
+                                } else {
+                                    IntKind::UInt
+                                }
+                            });
+
+                        (kind, value)
+                    }
                };

-                Ok(ParseResult::New(Var::new(name,
-                                             None,
-                                             ty,
-                                             Some(value),
-                                             true),
+                let ty = Item::builtin_type(TypeKind::Int(int_kind), true, ctx);
+
+                Ok(ParseResult::New(Var::new(name, None, ty, Some(val), true),
                                    Some(cursor)))
            }
            CXCursor_VarDecl => {
@ -153,49 +177,43 @@ impl ClangSubItemParser for Var {
    }
 }

-/// Try and parse the immediately found tokens from an unit (if any) to integers
-fn parse_int_literal_tokens(cursor: &clang::Cursor,
-                            unit: &clang::TranslationUnit)
-                            -> Option<i64> {
-    use clangll::{CXToken_Literal, CXToken_Punctuation};
+/// Try and parse a macro using all the macros parsed until now.
+fn parse_macro(ctx: &BindgenContext,
+               cursor: &clang::Cursor,
+               unit: &clang::TranslationUnit)
+               -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
+    use cexpr::{expr, nom};

-    let tokens = match unit.tokens(cursor) {
+    let cexpr_tokens = match unit.cexpr_tokens(cursor) {
        None => return None,
        Some(tokens) => tokens,
    };

-    let mut literal = None;
-    let mut negate = false;
-    for token in tokens.into_iter() {
-        match token.kind {
-            CXToken_Punctuation if token.spelling == "-" => {
-                negate = !negate;
-            }
-            CXToken_Literal => {
-                literal = Some(token.spelling);
-                break;
-            }
-            _ => {
-                // Reset values if we found anything else
-                negate = false;
-                literal = None;
-            }
-        }
-    }
+    let parser = expr::IdentifierParser::new(ctx.parsed_macros());
+    let result = parser.macro_definition(&cexpr_tokens);

-    literal.and_then(|lit| {
-            if lit.starts_with("0x") {
-                // TODO: try to preserve hex literals?
-                i64::from_str_radix(&lit[2..], 16).ok()
-            } else if lit == "0" {
-                Some(0)
-            } else if lit.starts_with("0") {
-                i64::from_str_radix(&lit[1..], 8).ok()
-            } else {
-                lit.parse().ok()
-            }
-        })
-        .map(|lit| if negate { -lit } else { lit })
+    match result {
+        nom::IResult::Done(_, (id, val)) => Some((id.into(), val)),
+        _ => None,
+    }
+}
+
+fn parse_int_literal_tokens(cursor: &clang::Cursor,
+                            unit: &clang::TranslationUnit)
+                            -> Option<i64> {
+    use cexpr::{expr, nom};
+    use cexpr::expr::EvalResult;
+
+    let cexpr_tokens = match unit.cexpr_tokens(cursor) {
+        None => return None,
+        Some(tokens) => tokens,
+    };
+
+    // TODO(emilio): We can try to parse other kinds of literals.
+    match expr::expr(&cexpr_tokens) {
+        nom::IResult::Done(_, EvalResult::Int(Wrapping(val))) => Some(val),
+        _ => None,
+    }
 }

 fn get_integer_literal_from_cursor(cursor: &clang::Cursor,
--- a/src/lib.rs
+++ b/src/lib.rs
@ -24,6 +24,7 @@

 #[macro_use]
 extern crate cfg_if;
+extern crate cexpr;
 extern crate syntex_syntax as syntax;
 extern crate aster;
 extern crate quasi;
@ -62,6 +63,8 @@ mod parse;
 mod regex_set;
 mod uses;

+pub mod chooser;
+
 #[cfg(rustfmt)]
 mod codegen;

@ -237,6 +240,13 @@ impl Builder {
        self
    }

+    /// Allows configuring types in different situations, see the `TypeChooser`
+    /// documentation.
+    pub fn type_chooser(mut self, cb: Box<chooser::TypeChooser>) -> Self {
+        self.options.type_chooser = Some(cb);
+        self
+    }
+
    /// Generate the Rust bindings using the options built up thus far.
    pub fn generate<'ctx>(self) -> Result<Bindings<'ctx>, ()> {
        Bindings::generate(self.options, None)
@ -331,6 +341,10 @@ pub struct BindgenOptions {
    /// Generate a dummy C/C++ file that includes the header and has dummy uses
    /// of all types defined therein. See the `uses` module for more.
    pub dummy_uses: Option<String>,
+
+    /// A user-provided type chooser to allow customizing which types are
+    /// chosen in different situations.
+    pub type_chooser: Option<Box<chooser::TypeChooser>>,
 }

 impl Default for BindgenOptions {
@ -359,6 +373,7 @@ impl Default for BindgenOptions {
            clang_args: vec![],
            input_header: None,
            dummy_uses: None,
+            type_chooser: None,
        }
    }
 }
--- a/tests/expectations/tests/jsval_layout_opaque.rs
+++ b/tests/expectations/tests/jsval_layout_opaque.rs
@ -24,8 +24,9 @@ impl <T> ::std::clone::Clone for __BindgenUnionField<T> {
    fn clone(&self) -> Self { Self::new() }
 }
 impl <T> ::std::marker::Copy for __BindgenUnionField<T> { }
-pub const JSVAL_ALIGNMENT: ::std::os::raw::c_uint = 8;
 pub const JSVAL_TAG_SHIFT: ::std::os::raw::c_uint = 47;
+pub const JSVAL_PAYLOAD_MASK: ::std::os::raw::c_ulonglong = 140737488355327;
+pub const JSVAL_TAG_MASK: ::std::os::raw::c_longlong = -140737488355328;
 #[repr(u8)]
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum JSValueType {
--- a/tests/expectations/tests/macro-expr-basic.rs
+++ b/tests/expectations/tests/macro-expr-basic.rs
@ -0,0 +1,14 @@
+/* automatically generated by rust-bindgen */
+
+
+#![allow(non_snake_case)]
+
+
+pub const FOO: ::std::os::raw::c_uint = 1;
+pub const BAR: ::std::os::raw::c_uint = 4;
+pub const BAZ: ::std::os::raw::c_uint = 5;
+pub const BARR: ::std::os::raw::c_uint = 1;
+pub const BAZZ: ::std::os::raw::c_uint = 7;
+pub const I_RAN_OUT_OF_DUMB_NAMES: ::std::os::raw::c_uint = 7;
+pub const HAZ_A_COMMENT: ::std::os::raw::c_uint = 1;
+pub const HAZ_A_COMMENT_INSIDE: ::std::os::raw::c_uint = 2;
--- a/tests/headers/macro-expr-basic.h
+++ b/tests/headers/macro-expr-basic.h
@ -0,0 +1,12 @@
+#define FOO 1
+#define BAR 4
+#define BAZ (FOO + BAR)
+
+#define BARR (1 << 0)
+#define BAZZ ((1 << 1) + BAZ)
+#define I_RAN_OUT_OF_DUMB_NAMES (BARR | BAZZ)
+
+/* I haz a comment */
+#define HAZ_A_COMMENT BARR
+
+#define HAZ_A_COMMENT_INSIDE (/* comment for real */ BARR + FOO)