// scummvm/engines/sci/parser/vocabulary.h

/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.

 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */

#ifndef SCI_SCICORE_VOCABULARY_H
#define SCI_SCICORE_VOCABULARY_H

#include "common/str.h"
#include "common/hashmap.h"
#include "common/hash-str.h"
#include "common/list.h"

#include "sci/sci.h"
#include "sci/engine/vm_types.h"

// Forward declaration only: this header refers to Common::Serializer solely
// by reference (see Vocabulary::saveLoadWithSerializer), so its full
// definition is not required here.
namespace Common {

class Serializer;

}

namespace Sci {

// Forward declaration; this header only passes and stores ResourceManager pointers.
class ResourceManager;

/*#define VOCABULARY_DEBUG */

/**
 * Resource numbers of the vocabulary resources.
 * Note that some SCI0 and SCI1 enumerators share the same numeric value
 * (900 and 901), so they cannot be told apart by value alone.
 */
enum {
	// SCI0-prefixed resources
	VOCAB_RESOURCE_SCI0_MAIN_VOCAB = 0,
	VOCAB_RESOURCE_SCI0_PARSE_TREE_BRANCHES = 900,
	VOCAB_RESOURCE_SCI0_SUFFIX_VOCAB = 901,

	// SCI1-prefixed resources
	VOCAB_RESOURCE_SCI1_MAIN_VOCAB = 900,
	VOCAB_RESOURCE_SCI1_PARSE_TREE_BRANCHES = 901,
	VOCAB_RESOURCE_SCI1_SUFFIX_VOCAB = 902,

	// Resources without a version prefix
	VOCAB_RESOURCE_ALT_INPUTS = 913,
	VOCAB_RESOURCE_SELECTORS = 997
};


/**
 * Word class values. The first eight enumerators occupy one bit each
 * (bits 0 through 7).
 */
enum {
	VOCAB_CLASS_PREPOSITION     = 1 << 0,
	VOCAB_CLASS_ARTICLE         = 1 << 1,
	VOCAB_CLASS_ADJECTIVE       = 1 << 2,
	VOCAB_CLASS_PRONOUN         = 1 << 3,
	VOCAB_CLASS_NOUN            = 1 << 4,
	VOCAB_CLASS_INDICATIVE_VERB = 1 << 5,
	VOCAB_CLASS_ADVERB          = 1 << 6,
	VOCAB_CLASS_IMPERATIVE_VERB = 1 << 7,

	// NOTE(review): this has the same numeric value as
	// VOCAB_CLASS_PREPOSITION (1); confirm that the overlap is intended.
	VOCAB_CLASS_NUMBER = 0x001
};

/** Parse token codes, numbered consecutively from 0 to 4. */
enum {
	kParseEndOfInput = 0,
	kParseOpeningParenthesis,
	kParseClosingParenthesis,
	kParseNil,
	kParseNumber
};

#define VOCAB_MAX_WORDLENGTH 256

/* Anywords are ignored by the parser */
#define VOCAB_CLASS_ANYWORD 0xff

/* This word class is used for numbers */
#define VOCAB_MAGIC_NUMBER_GROUP 0xffd /* 0xffe ? */
#define VOCAB_MAGIC_NOTHING_GROUP 0xffe

/* Number of nodes for each ParseTreeNode structure */
#define VOCAB_TREE_NODES 500

#define VOCAB_TREE_NODE_LAST_WORD_STORAGE 0x140
#define VOCAB_TREE_NODE_COMPARE_TYPE 0x146
#define VOCAB_TREE_NODE_COMPARE_GROUP 0x14d
#define VOCAB_TREE_NODE_FORCE_STORAGE 0x154

/* Said spec tokens. NOTE(review): the names suggest the operator characters
 * , & / ( ) [ ] # < > plus a terminator; confirm against the said parser. */
#define SAID_COMMA   0xf0
#define SAID_AMP     0xf1
#define SAID_SLASH   0xf2
#define SAID_PARENO  0xf3
#define SAID_PARENC  0xf4
#define SAID_BRACKO  0xf5
#define SAID_BRACKC  0xf6
#define SAID_HASH    0xf7
#define SAID_LT      0xf8
#define SAID_GT      0xf9
#define SAID_TERM    0xff

/* Lowest of the SAID_* token values listed above */
#define SAID_FIRST SAID_COMMA

/* Match results. There was no 'last matching word': */
#define SAID_FULL_MATCH 0xffff
#define SAID_NO_MATCH 0xfffe
#define SAID_PARTIAL_MATCH 0xfffd

/* Shifts x left by 8 bits; x is parenthesized so that compound expressions expand correctly */
#define SAID_LONG(x) ((x) << 8)

/** Pairs a word class with a word group. */
struct ResultWord {
	/** Word class */
	int _class;

	/** Word group */
	int _group;
};

/** A list of ResultWord entries. */
typedef Common::List<ResultWord> ResultWordList;
/** A list of ResultWordList entries. */
typedef Common::List<ResultWordList> ResultWordListList;

/** Maps a word string to its ResultWordList, using case-sensitive hashing and comparison. */
typedef Common::HashMap<Common::String, ResultWordList, Common::CaseSensitiveString_Hash, Common::CaseSensitiveString_EqualTo> WordMap;


// Forward declaration; this header only uses pointers to ParseRuleList
// (see Vocabulary::buildGNF, Vocabulary::freeRuleList and Vocabulary::_parserRules).
struct ParseRuleList;

/** Describes one suffix entry used when looking up words. */
struct suffix_t {
	/** The word class this suffix applies to */
	int class_mask;
	/** The word class a word is morphed to if it doesn't fail this check */
	int result_class;

	/** String length of the alternative suffix */
	int alt_suffix_length;
	/** String length of the suffix as used in the word vocabulary */
	int word_suffix_length;

	/** The alternative suffix */
	const char *alt_suffix;
	/** The suffix as used in the word vocabulary */
	const char *word_suffix;
};

/** A list of suffix_t entries. */
typedef Common::List<suffix_t> SuffixList;


/** Maps one word group onto another word group. */
struct synonym_t {
	/** The word group to replace */
	uint16 replaceant;

	/** The replacement word group for this one */
	uint16 replacement;
};

/**
 * An array of synonym_t entries.
 * NOTE(review): common/array.h is not included directly by this header;
 * presumably Common::Array arrives through another include - confirm.
 */
typedef Common::Array<synonym_t> SynonymList;


/**
 * One alternative input entry (see Vocabulary::checkAltInput).
 * NOTE(review): the field descriptions below are inferred from the names;
 * confirm against the code that fills and reads them.
 */
struct AltInput {
	/** Presumably the input text to look for */
	const char *_input;

	/** Presumably the text that replaces _input */
	const char *_replacement;

	/** Presumably the length of _input */
	unsigned int _inputLength;

	/** Meaning not visible in this header */
	bool _prefix;
};


/** One grammar branch: an id followed by ten data values. */
struct parse_tree_branch_t {
	/** Branch id */
	int id;

	/** Branch payload; always exactly ten entries */
	int data[10];
};

/** Node kinds of a ParseTreeNode, numbered consecutively starting at 4. */
enum ParseTypes {
	kParseTreeWordNode = 4,
	kParseTreeLeafNode,
	kParseTreeBranchNode
};

/** A single node of the parse tree. */
struct ParseTreeNode {
	/** Leaf or branch */
	ParseTypes type;

	/** For leaves */
	int value;

	/** Left child, for branches */
	ParseTreeNode *left;

	/** Right child, for branches (and word leaves) */
	ParseTreeNode *right;
};

/** Vocabulary format versions (see Vocabulary::_vocabVersion). */
enum VocabularyVersions {
	kVocabularySCI0 = 0,
	kVocabularySCI1
};

/**
 * Holds the parser data declared below: the word map, the suffix list, the
 * grammar branches and the GNF rules built from them, the synonym list, the
 * alternative inputs, and the parse tree that is accessed by said().
 */
class Vocabulary {
public:
	/**
	 * @param resMan	The resource manager.
	 *					NOTE(review): presumably kept in _resMan - confirm
	 * @param foreign	NOTE(review): presumably kept in _foreign; its exact
	 *					meaning is not visible in this header
	 */
	Vocabulary(ResourceManager *resMan, bool foreign);
	~Vocabulary();

	// reset parser status
	void reset();

	/**
	 * Gets any word from the specified group. For debugging only.
	 * @param group		Group number
	 * @return	A word from the group. NOTE(review): the result for a group
	 *			without words is not visible in this header
	 */
	const char *getAnyWordFromGroup(int group);


	/**
	 * Looks up a single word in the words and suffixes list.
	 * @param[out] retval	the list of matches
	 * @param[in] word		pointer to the word to look up
	 * @param[in] word_len	length of the word to look up
	 */
	void lookupWord(ResultWordList &retval, const char *word, int word_len);


	/**
	 * Tokenizes a string and compiles it into lists of ResultWords.
	 * @param[out] retval		A list of words which will be set to the result
	 * @param[in] sentence		The sentence to examine
	 * @param[out] error		Set to a malloc'd copy of the offending word, or
	 *							to NULL (see below)
	 * @return true on success, false on failure
	 *
	 * On error, false is returned. If *error is NULL, the sentence did not
	 * contain any useful words; if not, *error points to a malloc'd copy of
	 * the offending word. The returned list may contain anywords.
	 * NOTE(review): presumably the caller has to free() *error - confirm.
	 */
	bool tokenizeString(ResultWordListList &retval, const char *sentence, char **error);

	/**
	 * Builds a parse tree from a list of words, using a set of Greibach Normal
	 * Form rules.
	 * @param words		The words to build the tree from
	 * @param verbose	Set to true for debugging
	 * @return	0 on success, 1 if the tree couldn't be built in VOCAB_TREE_NODES
	 *			nodes or if the sentence structure in 'words' is not part of the
	 *			language described by the grammar rules.
	 *
	 * NOTE(review): no rule list is passed in; presumably _parserRules is
	 * used and the tree is stored in _parserNodes - confirm.
	 */
	int parseGNF(const ResultWordListList &words, bool verbose = false);

	/**
	 * Constructs the Greibach Normal Form of the grammar branches.
	 * NOTE(review): no branches are passed in; presumably _parserBranches is
	 * the input - confirm.
	 * @param verbose	Set to true for debugging. If true, the list is
	 *					freed before the function ends
	 * @return	Pointer to a list of singly linked GNF rules describing the same
	 *			language that was described by the branches
	 *
	 * The original SCI rules are in almost-CNF (Chomsky Normal Form). Note that
	 * branch[0] is used only for a few magical incantations, as it is treated
	 * specially by the SCI parser.
	 */
	ParseRuleList *buildGNF(bool verbose = false);

	/**
	 * Deciphers a said block and dumps its content via debugN.
	 * For debugging only.
	 * @param pos	pointer to the data to dump
	 */
	void debugDecipherSaidBlock(const byte *pos);

	/**
	 * Prints the parser suffixes to the debug console.
	 */
	void printSuffixes() const;

	/**
	 * Prints the parser words to the debug console.
	 */
	void printParserWords() const;

	/**
	 * Returns the number of loaded grammar branches.
	 */
	uint getParserBranchesSize() const { return _parserBranches.size(); }

	/**
	 * Returns the grammar branch with the given index.
	 * @param number	Index into the branch array; no range check is done
	 *					here beyond whatever Common::Array's operator[] does
	 */
	const parse_tree_branch_t &getParseTreeBranch(int number) const { return _parserBranches[number]; }

	/**
	 * Adds a new synonym to the list
	 * @param syn	The synonym to append
	 */
	void addSynonym(synonym_t syn) { _synonyms.push_back(syn); }

	/**
	 * Clears the list of synonyms
	 */
	void clearSynonyms() { _synonyms.clear(); }

	/**
	 * Synonymizes a token list
	 * @param words		The word list to synonymize; passed by non-const
	 *					reference. NOTE(review): presumably rewritten in
	 *					place using the synonym list - confirm
	 */
	void synonymizeTokens(ResultWordListList &words);

	/**
	 * NOTE(review): undocumented; presumably prints parse tree nodes for
	 * debugging - confirm the meaning of 'num' against the implementation.
	 */
	void printParserNodes(int num);

	/**
	 * NOTE(review): undocumented; presumably dumps the current parse tree
	 * (_parserNodes) for debugging - confirm.
	 */
	void dumpParseTree();

	/**
	 * NOTE(review): undocumented; the argc/argv parameters suggest that a
	 * parse tree is built from command-style arguments (for example from a
	 * debug console) - confirm against the implementation and its callers.
	 */
	int parseNodes(int *i, int *pos, int type, int nr, int argc, const char **argv);

	/**
	 * Check text input against alternative inputs.
	 * @param text The text to process. It will be modified in-place
	 * @param cursorPos The cursor position; passed by non-const reference.
	 *                  NOTE(review): presumably adjusted when the text changes - confirm
	 * @return true if anything changed
	 */
	bool checkAltInput(Common::String& text, uint16& cursorPos);

	/**
	 * Save/load vocabulary data
	 * @param ser	The serializer to save to or load from
	 */
	void saveLoadWithSerializer(Common::Serializer &ser);

private:
	/**
	 * Loads all words from the main vocabulary.
	 * @return true on success, false on failure
	 */
	bool loadParserWords();

	/**
	 * Loads all suffixes from the suffix vocabulary.
	 * @return true on success, false on failure
	 */
	bool loadSuffixes();

	/**
	 * Frees all suffixes. Takes no arguments.
	 * NOTE(review): presumably operates on _parserSuffixes - confirm.
	 */
	void freeSuffixes();

	/**
	 * Retrieves all grammar rules from the resource data. Takes no arguments.
	 * NOTE(review): presumably the rules are stored in _parserBranches - confirm.
	 * @return true on success, false on error
	 */
	bool loadBranches();

	/**
	 * Frees a parser rule list as returned by buildGNF().
	 * @param rule_list		the rule list to free
	 */
	void freeRuleList(ParseRuleList *rule_list);


	/**
	 * Retrieves all alternative input combinations from vocab 913.
	 * @return true on success, false on error
	 */
	bool loadAltInputs();

	/**
	 * Frees all alternative input combinations.
	 */
	void freeAltInputs();

	ResourceManager *_resMan;
	VocabularyVersions _vocabVersion;

	bool _foreign;
	// Resource numbers. NOTE(review): presumably chosen from the
	// VOCAB_RESOURCE_* values according to _vocabVersion - confirm.
	uint16 _resourceIdWords;
	uint16 _resourceIdSuffixes;
	uint16 _resourceIdBranches;

	// Parser-related lists
	SuffixList _parserSuffixes;
	ParseRuleList *_parserRules; /**< GNF rules used in the parser algorithm */
	Common::Array<parse_tree_branch_t> _parserBranches;
	WordMap _parserWords;
	SynonymList _synonyms; /**< The list of synonyms */
	Common::Array<Common::List<AltInput> > _altInputs;

public:
	// Accessed by said()
	ParseTreeNode _parserNodes[VOCAB_TREE_NODES]; /**< The parse tree */

	// Parser data:
	reg_t parser_event; /**< The event passed to Parse() and later used by Said() */
	bool parserIsValid; /**< If something has been correctly parsed */
};

/**
 * Prints a parse tree.
 * NOTE(review): the output destination is not visible in this header.
 * @param tree_name		Name of the tree to dump (free-form)
 * @param nodes			The nodes containing the parse tree
 */
void vocab_dump_parse_tree(const char *tree_name, ParseTreeNode *nodes);


/**
 * Builds a parse tree from a spec and compares it to a parse tree.
 * NOTE(review): the tree compared against is presumably the current one in
 * Vocabulary::_parserNodes, which this header marks as accessed by said().
 * @param spec		Pointer to the spec to build
 * @param verbose	Whether to display the parse tree after building it
 * @return 1 on a match, 0 otherwise
 */
int said(const byte *spec, bool verbose);

} // End of namespace Sci

#endif // SCI_SCICORE_VOCABULARY_H