From 29031a4c85b75e5ed61c01b7fb5a94edd2fb9bdd Mon Sep 17 00:00:00 2001 From: Johan Malm Date: Mon, 30 Jan 2023 21:24:52 +0000 Subject: [PATCH] Add scripts/find-banned.sh --- scripts/.gitignore | 2 + scripts/find-banned.sh | 6 + scripts/helper/Makefile | 12 ++ scripts/helper/find-idents.c | 405 +++++++++++++++++++++++++++++++++++ 4 files changed, 425 insertions(+) create mode 100644 scripts/.gitignore create mode 100755 scripts/find-banned.sh create mode 100644 scripts/helper/Makefile create mode 100644 scripts/helper/find-idents.c diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 00000000..ed76aa70 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,2 @@ +*.o +find-banned diff --git a/scripts/find-banned.sh b/scripts/find-banned.sh new file mode 100755 index 00000000..25e915f9 --- /dev/null +++ b/scripts/find-banned.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +banned="malloc,g_strcmp0,sprintf,vsprintf,strcpy,strncpy,strcat,strncat" + +find src/ include/ \( -name "*.c" -o -name "*.h" \) -type f \ + | ./scripts/helper/find-idents --tokens=$banned - diff --git a/scripts/helper/Makefile b/scripts/helper/Makefile new file mode 100644 index 00000000..e04f063f --- /dev/null +++ b/scripts/helper/Makefile @@ -0,0 +1,12 @@ +CFLAGS += -g -Wall -O0 -std=c11 +LDFLAGS += -fsanitize=address + +PROGS = find-idents + +all: $(PROGS) + +find-idents: find-idents.o + $(CC) -o $@ $^ + +clean : + $(RM) $(PROGS) *.o diff --git a/scripts/helper/find-idents.c b/scripts/helper/find-idents.c new file mode 100644 index 00000000..69556ae4 --- /dev/null +++ b/scripts/helper/find-idents.c @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Helper to find identifier names in C files + * + * Copyright (C) Johan Malm 2023 + * + * It tokenizes the specified C file and searches all identifier-tokens against + * the specified patterns. + * + * An identifier in this context is any alphanumeric/underscore string starting + * with a letter [A-Za-z] or underscore. It represents entities such as + * functions, variables, user-defined data types and C language keywords. + * Alphanumeric strings within comments are ignored, but not parsing of tokens + * is carried out to understand their semantic meaning. + */ + +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include + +struct buf { + char *buf; + int alloc; + int len; +}; + +enum token_kind { + TOKEN_NONE = 0, + TOKEN_IDENTIFIER, /* For example: static extern if while */ + TOKEN_LITERAL, /* For example: 0xff 42 "foo" */ + TOKEN_SPECIAL, /* For example: ++ -= ! ... */ +}; + +struct token { + int line; + enum token_kind kind; + struct buf name; + unsigned int special; +}; + +enum { + SPECIAL_ELLIPSIS = 256, + SPECIAL_ASSIGN, + SPECIAL_BIT_OP, + SPECIAL_INC_OP, + SPECIAL_DEC_OP, + SPECIAL_PTR_OP, + SPECIAL_AND_OP, + SPECIAL_OR_OP, + SPECIAL_COMPARISON_OP, + SPECIAL_COMMENT_BEGIN, + SPECIAL_COMMENT_END, +}; + +static char *current_buffer_position; +static struct token *tokens; +static int nr_tokens, alloc_tokens; +static int current_line = 1; + +void +buf_init(struct buf *s) +{ + s->alloc = 256; + s->buf = malloc(s->alloc); + s->buf[0] = '\0'; + s->len = 0; +} + +void +buf_add(struct buf *s, const char *data, size_t len) +{ + if (!data || data[0] == '\0') { + return; + } + if (s->alloc <= s->len + len + 1) { + s->alloc = s->alloc + len; + s->buf = realloc(s->buf, s->alloc); + } + memcpy(s->buf + s->len, data, len); + s->len += len; + s->buf[s->len] = 0; +} + +void +buf_add_char(struct buf *s, char ch) +{ + if (s->alloc <= s->len + 1) { + s->alloc = s->alloc * 2 + 16; + s->buf = realloc(s->buf, s->alloc); + } + s->buf[s->len++] = ch; + s->buf[s->len] = 0; +} + +static struct token * +add_token(void) +{ + if (nr_tokens == alloc_tokens) { + alloc_tokens = (alloc_tokens + 16) * 2; + tokens = realloc(tokens, alloc_tokens * sizeof(struct token)); + } + struct token *token = tokens + nr_tokens; + memset(token, 0, sizeof(*token)); + nr_tokens++; + buf_init(&token->name); + token->line = current_line; + return token; +} + +static void +handle_whitespace(struct token *token) +{ + if (current_buffer_position[0] == '\n') { + ++current_line; + } + current_buffer_position++; + if (isspace(current_buffer_position[0])) { + handle_whitespace(token); + } +} + +static void +get_identifier_token(struct token *token) +{ + buf_add_char(&token->name, current_buffer_position[0]); + current_buffer_position++; + if (isspace(current_buffer_position[0])) { + handle_whitespace(token); + return; + } + switch (current_buffer_position[0]) { + case '\0': + break; + case 'a' ... 'z': + case 'A' ... 'Z': + case '0' ... '9': + case '_': + case '#': + get_identifier_token(token); + break; + default: + break; + } +} + +static void +get_number_token(struct token *token) +{ + buf_add_char(&token->name, current_buffer_position[0]); + current_buffer_position++; + if (isspace(current_buffer_position[0])) { + handle_whitespace(token); + return; + } + switch (current_buffer_position[0]) { + case '\0': + break; + case '0' ... '9': + case 'a' ... 'f': + case 'A' ... 'F': + case 'x': + get_number_token(token); + break; + default: + break; + } +} + +struct { + const char *combo; + unsigned int special; +} specials[] = { + { "...", SPECIAL_ELLIPSIS }, + { ">>=", SPECIAL_ASSIGN }, + { "<<=", SPECIAL_ASSIGN }, + { "+=", SPECIAL_ASSIGN }, + { "-=", SPECIAL_ASSIGN }, + { "*=", SPECIAL_ASSIGN }, + { "/=", SPECIAL_ASSIGN }, + { "%=", SPECIAL_ASSIGN }, + { "&=", SPECIAL_ASSIGN }, + { "^=", SPECIAL_ASSIGN }, + { "|=", SPECIAL_ASSIGN }, + { ">>", SPECIAL_BIT_OP }, + { "<<", SPECIAL_BIT_OP }, + { "++", SPECIAL_INC_OP }, + { "--", SPECIAL_DEC_OP }, + { "->", SPECIAL_PTR_OP }, + { "&&", SPECIAL_AND_OP }, + { "||", SPECIAL_OR_OP }, + { "<=", SPECIAL_COMPARISON_OP }, + { ">=", SPECIAL_COMPARISON_OP }, + { "==", SPECIAL_COMPARISON_OP }, + { "!=", SPECIAL_COMPARISON_OP }, + { "/*", SPECIAL_COMMENT_BEGIN }, + { "*/", SPECIAL_COMMENT_END }, + { ";", ';' }, + { "{", '{' }, + { "}", '}' }, + { ",", ',' }, + { ":", ':' }, + { "=", '=' }, + { "(", '(' }, + { ")", ')' }, + { "[", '[' }, + { "]", ']' }, + { ".", '.' }, + { "&", '&' }, + { "!", '!' }, + { "~", '~' }, + { "-", '-' }, + { "+", '+' }, + { "*", '*' }, + { "/", '/' }, + { "%", '%' }, + { "<", '<' }, + { ">", '>' }, + { "^", '^' }, + { "|", '|' }, + { "?", '?' }, +}; + +static void +get_special_token(struct token *token) +{ +#define MAX_SPECIAL_LEN (3) + /* Peek up to MAX_SPECIAL_LEN-1 characters ahead */ + char buf[MAX_SPECIAL_LEN + 1] = { 0 }; + for (int i = 0; i < MAX_SPECIAL_LEN; i++) { + buf[i] = current_buffer_position[i]; + if (!current_buffer_position[i]) { + break; + } + } +#undef MAX_SPECIAL_LEN + + /* Compare with longest special tokens first */ + int k; + for (k = strlen(buf); k > 0; k--) { + for (int j = 0; sizeof(specials) / sizeof(specials[0]); j++) { + if (strlen(specials[j].combo) < k) { + break; + } + if (!strcmp(specials[j].combo, buf)) { + buf_add(&token->name, buf, k); + token->special = specials[j].special; + goto done; + } + } + buf[k - 1] = '\0'; + } +done: + current_buffer_position += token->name.len; + if (isspace(current_buffer_position[0])) { + handle_whitespace(token); + } +} + +static void +handle_preprocessor_directive(void) +{ + /* We just ignore preprocessor lines */ + for (;;) { + ++current_buffer_position; + if (current_buffer_position[0] == '\0') { + return; + } + if (current_buffer_position[0] == '\n') { + ++current_line; + return; + } + } +} + +struct token * +lex(char *buffer) +{ + tokens = NULL; + nr_tokens = 0; + alloc_tokens = 0; + + current_buffer_position = buffer; + + for (;;) { + struct token *token = NULL; + switch (current_buffer_position[0]) { + case '\0': + goto out; + case 'a' ... 'z': + case 'A' ... 'Z': + case '_': + token = add_token(); + get_identifier_token(token); + token->kind = TOKEN_IDENTIFIER; + continue; + case '0' ... '9': + token = add_token(); + get_number_token(token); + token->kind = TOKEN_LITERAL; + continue; + case '+': case '-': case '*': case '/': case '%': case '.': + case '>': case '<': case '=': case '!': case '&': case '|': + case '^': case '{': case '}': case '(': case ')': case ',': + case ';': case ':': case '[': case ']': case '~': case '?': + token = add_token(); + get_special_token(token); + token->kind = TOKEN_SPECIAL; + continue; + case '#': + handle_preprocessor_directive(); + break; + case '\n': + ++current_line; + break; + default: + break; + } + ++current_buffer_position; + } +out: + add_token(); /* end marker */ + return tokens; +} + +char * +read_file(const char *filename) +{ + char *line = NULL; + size_t len = 0; + FILE *stream = fopen(filename, "r"); + if (!stream) { + fprintf(stderr, "warn: cannot read '%s'\n", filename); + return NULL; + } + struct buf buffer; + buf_init(&buffer); + while ((getline(&line, &len, stream) != -1)) { + buf_add(&buffer, line, strlen(line)); + } + free(line); + fclose(stream); + return buffer.buf; +} + +static bool +grep(struct token *tokens, const char *pattern) +{ + bool found = false; + bool in_comment = false; + + for (struct token *t = tokens; t->kind; t++) { + if (t->kind == TOKEN_SPECIAL) { + if (t->special == SPECIAL_COMMENT_BEGIN) { + in_comment = true; + } else if (t->special == SPECIAL_COMMENT_END) { + in_comment = false; + } + } + if (in_comment) { + continue; + } + if (t->kind == TOKEN_IDENTIFIER) { + if (!pattern || !strcmp(t->name.buf, pattern)) { + found = true; + printf("%d:\t%s\n", t->line, t->name.buf); + } + } + } + return found; +} + +int +main(int argc, char **argv) +{ + struct token *tokens; + int found = false; + + if (argc < 2) { + fprintf(stderr, "usage: %s [...]\n", argv[0]); + return EXIT_FAILURE; + } + + char *buffer = read_file(argv[1]); + if (!buffer) { + return EXIT_FAILURE; + } + tokens = lex(buffer); + free(buffer); + + if (argc == 2) { + /* Dump all idents */ + grep(tokens, NULL); + } else { + for (int i = 2; i < argc; ++i) { + found |= grep(tokens, argv[i]); + } + } + + /* return failure (1) if we have found a banned identifier */ + return found; +}