labwc/scripts/helper/find-idents.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Helper to find identifier names in C files
 *
 * Copyright (C) Johan Malm 2023
 *
 * It tokenizes the specified C file and searches all identifier-tokens against
 * the specified patterns.
 *
 * An identifier in this context is any alphanumeric/underscore string starting
 * with a letter [A-Za-z] or underscore. It represents entities such as
 * functions, variables, user-defined data types and C language keywords.
 * Alphanumeric strings within comments are ignored, but not parsing of tokens
 * is carried out to understand their semantic meaning.
 */

#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf {
	char *buf;
	int alloc;
	int len;
};

enum token_kind {
	TOKEN_NONE = 0,
	TOKEN_IDENTIFIER, /* For example: static extern if while */
	TOKEN_LITERAL, /* For example: 0xff 42 "foo" */
	TOKEN_SPECIAL, /* For example: ++ -= ! ... */
};

struct token {
	int line;
	enum token_kind kind;
	struct buf name;
	unsigned int special;
};

enum {
	SPECIAL_ELLIPSIS = 256,
	SPECIAL_ASSIGN,
	SPECIAL_BIT_OP,
	SPECIAL_INC_OP,
	SPECIAL_DEC_OP,
	SPECIAL_PTR_OP,
	SPECIAL_AND_OP,
	SPECIAL_OR_OP,
	SPECIAL_COMPARISON_OP,
	SPECIAL_COMMENT_BEGIN,
	SPECIAL_COMMENT_END,
	SPECIAL_COMMENT_LINE_BEGIN,
};

static char *current_buffer_position;
static struct token *tokens;
static int nr_tokens, alloc_tokens;
static int current_line;
static char **argv_tokens;
static int found_token;

static const char find_banned_usage[] =
"Usage: find-banned [OPTIONS...] FILE\n"
"When FILE is -, read stdin\n"
"OPTIONS:\n"
"  --tokens=<tokens>     Comma-separated string of idents to grep for\n";

static void
usage(void)
{
	printf("%s", find_banned_usage);
	exit(0);
}

char **
split(char *str, char delim)
{
	if (!str) {
		return NULL;
	}
	int argc = 1;
	char *p = str;
	while (*p) {
		if (*p == delim) {
			argc++;
		}
		++p;
	}

	char **argv = calloc(argc + 1, sizeof(*argv));
	char **argvp = argv;
	p = str;
	while (*str) {
		if (*str == delim) {
			*argvp++ = strndup(p, str-p);
			p = str + 1;
		}
		++str;
	}
	*argvp++ = strndup(p, str-p);
	*argvp = NULL;
	return argv;
}

void
buf_init(struct buf *s)
{
	s->alloc = 256;
	s->buf = malloc(s->alloc);
	s->buf[0] = '\0';
	s->len = 0;
}

void
buf_add(struct buf *s, const char *data, size_t len)
{
	if (!data || data[0] == '\0') {
		return;
	}
	if (s->alloc <= s->len + len + 1) {
		s->alloc = s->alloc + len;
		s->buf = realloc(s->buf, s->alloc);
	}
	memcpy(s->buf + s->len, data, len);
	s->len += len;
	s->buf[s->len] = 0;
}

void
buf_add_char(struct buf *s, char ch)
{
	if (s->alloc <= s->len + 1) {
		s->alloc = s->alloc * 2 + 16;
		s->buf = realloc(s->buf, s->alloc);
	}
	s->buf[s->len++] = ch;
	s->buf[s->len] = 0;
}

static struct token *
add_token(void)
{
	if (nr_tokens == alloc_tokens) {
		alloc_tokens = (alloc_tokens + 16) * 2;
		tokens = realloc(tokens, alloc_tokens * sizeof(struct token));
	}
	struct token *token = tokens + nr_tokens;
	memset(token, 0, sizeof(*token));
	nr_tokens++;
	buf_init(&token->name);
	token->line = current_line;
	return token;
}

static void
get_identifier_token(struct token *token)
{
	buf_add_char(&token->name, current_buffer_position[0]);
	current_buffer_position++;
	if (isspace(current_buffer_position[0])) {
		return;
	}
	switch (current_buffer_position[0]) {
	case '\0':
		break;
	case 'a' ... 'z':
	case 'A' ... 'Z':
	case '0' ... '9':
	case '_':
	case '#':
		get_identifier_token(token);
		break;
	default:
		break;
	}
}

static void
get_number_token(struct token *token)
{
	buf_add_char(&token->name, current_buffer_position[0]);
	current_buffer_position++;
	if (isspace(current_buffer_position[0])) {
		return;
	}
	switch (current_buffer_position[0]) {
	case '\0':
		break;
	case '0' ... '9':
	case 'a' ... 'f':
	case 'A' ... 'F':
	case 'x':
		get_number_token(token);
		break;
	default:
		break;
	}
}

struct {
	const char *combo;
	unsigned int special;
} specials[] = {
	{ "...", SPECIAL_ELLIPSIS },
	{ ">>=", SPECIAL_ASSIGN },
	{ "<<=", SPECIAL_ASSIGN },
	{ "+=", SPECIAL_ASSIGN },
	{ "-=", SPECIAL_ASSIGN },
	{ "*=", SPECIAL_ASSIGN },
	{ "/=", SPECIAL_ASSIGN },
	{ "%=", SPECIAL_ASSIGN },
	{ "&=", SPECIAL_ASSIGN },
	{ "^=", SPECIAL_ASSIGN },
	{ "|=", SPECIAL_ASSIGN },
	{ ">>", SPECIAL_BIT_OP },
	{ "<<", SPECIAL_BIT_OP },
	{ "++", SPECIAL_INC_OP },
	{ "--", SPECIAL_DEC_OP },
	{ "->", SPECIAL_PTR_OP },
	{ "&&", SPECIAL_AND_OP },
	{ "||", SPECIAL_OR_OP },
	{ "<=", SPECIAL_COMPARISON_OP },
	{ ">=", SPECIAL_COMPARISON_OP },
	{ "==", SPECIAL_COMPARISON_OP },
	{ "!=", SPECIAL_COMPARISON_OP },
	{ "/*", SPECIAL_COMMENT_BEGIN },
	{ "*/", SPECIAL_COMMENT_END },
	{ "//", SPECIAL_COMMENT_LINE_BEGIN },
	{ ";", ';' },
	{ "{", '{' },
	{ "}", '}' },
	{ ",", ',' },
	{ ":", ':' },
	{ "=", '=' },
	{ "(", '(' },
	{ ")", ')' },
	{ "[", '[' },
	{ "]", ']' },
	{ ".", '.' },
	{ "&", '&' },
	{ "!", '!' },
	{ "~", '~' },
	{ "-", '-' },
	{ "+", '+' },
	{ "*", '*' },
	{ "/", '/' },
	{ "%", '%' },
	{ "<", '<' },
	{ ">", '>' },
	{ "^", '^' },
	{ "|", '|' },
	{ "?", '?' },
};

static void
get_special_token(struct token *token)
{
#define MAX_SPECIAL_LEN (3)
	/* Peek up to MAX_SPECIAL_LEN-1 characters ahead */
	char buf[MAX_SPECIAL_LEN + 1] = { 0 };
	for (int i = 0; i < MAX_SPECIAL_LEN; i++) {
		buf[i] = current_buffer_position[i];
		if (!current_buffer_position[i]) {
			break;
		}
	}
#undef MAX_SPECIAL_LEN

	/* Compare with longest special tokens first */
	int k;
	for (k = strlen(buf); k > 0; k--) {
		for (size_t j = 0; j < sizeof(specials) / sizeof(specials[0]); j++) {
			if (strlen(specials[j].combo) < k) {
				break;
			}
			if (!strcmp(specials[j].combo, buf)) {
				buf_add(&token->name, buf, k);
				token->special = specials[j].special;
				goto done;
			}
		}
		buf[k - 1] = '\0';
	}
done:
	current_buffer_position += token->name.len;
}

static void
handle_preprocessor_directive(void)
{
	/* We just ignore preprocessor lines */
	for (;;) {
		++current_buffer_position;
		if (current_buffer_position[0] == '\0') {
			return;
		}
		if (current_buffer_position[0] == '\n') {
			++current_line;
			return;
		}
	}
}

struct token *
lex(char *buffer)
{
	tokens = NULL;
	nr_tokens = 0;
	alloc_tokens = 0;
	bool in_single_comment = false;

	current_buffer_position = buffer;

	for (;;) {
		struct token *token = NULL;
		switch (current_buffer_position[0]) {
		case '\0':
			goto out;
		case 'a' ... 'z':
		case 'A' ... 'Z':
		case '_':
			token = add_token();
			get_identifier_token(token);
			token->kind = TOKEN_IDENTIFIER;
			continue;
		case '0' ... '9':
			token = add_token();
			get_number_token(token);
			token->kind = TOKEN_LITERAL;
			continue;
		case '+': case '-': case '*': case '/': case '%': case '.':
		case '>': case '<': case '=': case '!': case '&': case '|':
		case '^': case '{': case '}': case '(': case ')': case ',':
		case ';': case ':': case '[': case ']': case '~': case '?':
			token = add_token();
			get_special_token(token);
			token->kind = TOKEN_SPECIAL;
			if (token->special == SPECIAL_COMMENT_LINE_BEGIN) {
				token->special = SPECIAL_COMMENT_BEGIN;
				in_single_comment = true;
			}
			continue;
		case '#':
			handle_preprocessor_directive();
			break;
		case '\n':
			if (in_single_comment) {
				token = add_token();
				token->kind = TOKEN_SPECIAL;
				token->special = SPECIAL_COMMENT_END;
				in_single_comment = false;
			}
			++current_line;
			break;
		default:
			break;
		}
		++current_buffer_position;
	}
out:
	add_token(); /* end marker */
	return tokens;
}

char *
read_file(const char *filename)
{
	char *line = NULL;
	size_t len = 0;
	FILE *stream = fopen(filename, "r");
	if (!stream) {
		fprintf(stderr, "warn: cannot read '%s'\n", filename);
		return NULL;
	}
	struct buf buffer;
	buf_init(&buffer);
	while ((getline(&line, &len, stream) != -1)) {
		buf_add(&buffer, line, strlen(line));
	}
	free(line);
	fclose(stream);
	return buffer.buf;
}

static bool
grep(struct token *tokens, const char *filename, const char *pattern)
{
	bool found = false;
	unsigned int in_comment = 0;

	for (struct token *t = tokens; t->kind; t++) {
		if (t->kind == TOKEN_SPECIAL) {
			if (t->special == SPECIAL_COMMENT_BEGIN) {
				++in_comment;
			} else if (t->special == SPECIAL_COMMENT_END) {
				--in_comment;
			}
		}
		if (in_comment) {
			continue;
		}
		if (t->kind == TOKEN_IDENTIFIER) {
			if (!pattern || !strcmp(t->name.buf, pattern)) {
				found = true;
				printf("%s:%d\t%s\n", filename, t->line, t->name.buf);
			}
		}
	}
	return found;
}

static void
process_one_file(const char *filename)
{
	struct token *tokens;
	char *buffer = read_file(filename);
	if (!buffer) {
		exit(EXIT_FAILURE);
	}
	current_line = 1;
	tokens = lex(buffer);
	free(buffer);

	if (!argv_tokens) {
		/* Dump all idents */
		grep(tokens, filename, NULL);
	} else {
		for (char **p = argv_tokens; *p; p++) {
			found_token |= grep(tokens, filename, *p);
		}
	}
}

int
main(int argc, char **argv)
{
	if (argc < 2) {
		usage();
	}
	for (int i = 1; i < argc; ++i) {
		char *arg = argv[i];

		if (!strncmp(arg, "--tokens=", 9)) {
			argv_tokens = split(arg + 9, ',');
		}
		if (!strcmp(arg, "-")) {
			char *line = NULL;
			size_t len = 0;
			while ((getline(&line, &len, stdin) != -1)) {
				char *p = strrchr(line, '\n');
				if (p) {
					*p = '\0';
				}
				process_one_file(line);
			}
			free(line);
			break;
		}
		if (arg[0] != '-') {
			process_one_file(arg);
			break;
		}
	}

	if (argv_tokens) {
		for (char **p = argv_tokens; *p; p++) {
			free(*p);
		}
		free(argv_tokens);
	}

	/* return failure (1) if we have found a banned identifier */
	return found_token;
}