labwc/scripts/helper/find-idents.c

480 lines
9.4 KiB
C
Raw Normal View History

2023-01-30 21:24:52 +00:00
// SPDX-License-Identifier: GPL-2.0-only
/*
* Helper to find identifier names in C files
*
* Copyright (C) Johan Malm 2023
*
* It tokenizes the specified C file and searches all identifier-tokens against
* the specified patterns.
*
* An identifier in this context is any alphanumeric/underscore string starting
* with a letter [A-Za-z] or underscore. It represents entities such as
* functions, variables, user-defined data types and C language keywords.
* Alphanumeric strings within comments are ignored, but no parsing of tokens
* is carried out to understand their semantic meaning.
*/
#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Growable, NUL-terminated character buffer.
 * Set up by buf_init() and appended to by buf_add() / buf_add_char().
 */
struct buf {
/* Heap storage; the buf_*() helpers keep it NUL-terminated */
char *buf;
/* Number of bytes currently allocated for @buf */
int alloc;
/* Number of bytes in use, not counting the terminating NUL */
int len;
};
/*
 * Coarse classification assigned to each token by lex().
 * NOTE(review): lex() as written only creates TOKEN_LITERAL for tokens that
 * start with a digit; quoted strings are not tokenized as literals.
 */
enum token_kind {
/* Zero value; stays on the end-marker token so array walks can stop there */
TOKEN_NONE = 0,
TOKEN_IDENTIFIER, /* For example: static extern if while */
TOKEN_LITERAL, /* For example: 0xff 42 "foo" */
TOKEN_SPECIAL, /* For example: ++ -= ! ... */
};
/* One lexed token; instances live in the array built by add_token() */
struct token {
/* Value of current_line at the moment the token was created */
int line;
/* Token class; TOKEN_NONE (0) marks the end of the token array */
enum token_kind kind;
/* Text of the token as it appeared in the input */
struct buf name;
/*
 * For TOKEN_SPECIAL: the character code of a single-character punctuator
 * or one of the SPECIAL_* constants; left at 0 for other kinds
 */
unsigned int special;
};
/*
 * Codes stored in token->special for multi-character punctuators.
 *
 * Numbering starts at 256 so these can never collide with single-character
 * punctuators, which use their own character value as the code.
 */
enum {
	SPECIAL_ELLIPSIS = 256,     /* ... */
	SPECIAL_ASSIGN,             /* += -= *= /= %= &= ^= |= <<= >>= */
	SPECIAL_BIT_OP,             /* << >> */
	SPECIAL_INC_OP,             /* ++ */
	SPECIAL_DEC_OP,             /* -- */
	SPECIAL_PTR_OP,             /* -> */
	SPECIAL_AND_OP,             /* && */
	SPECIAL_OR_OP,              /* || */
	SPECIAL_COMPARISON_OP,      /* <= >= == != */
	SPECIAL_COMMENT_BEGIN,      /* opening of a block comment */
	SPECIAL_COMMENT_END,        /* closing of a block comment */
	SPECIAL_COMMENT_LINE_BEGIN, /* opening of a line comment */
};
/* Lexer cursor: next unread character of the buffer handed to lex() */
static char *current_buffer_position;
/* Token array built by lex(); grown on demand by add_token() */
static struct token *tokens;
/* Number of tokens in use / number of slots allocated in @tokens */
static int nr_tokens, alloc_tokens;
/*
 * Line number stamped on newly created tokens; set to 1 by
 * process_one_file() and incremented for each newline the lexer consumes
 */
static int current_line;
/* NULL-terminated identifier list from --tokens=, or NULL if not given */
static char **argv_tokens;
/* Non-zero once a requested identifier was found; main() returns it */
static int found_token;
/*
 * Help text printed by usage().
 * NOTE(review): the text names the program "find-banned", and main() treats
 * each line read from stdin as a file name to process rather than as source
 * text -- confirm the wording below is what is intended.
 */
static const char find_banned_usage[] =
"Usage: find-banned [OPTIONS...] FILE\n"
"When FILE is -, read stdin\n"
"OPTIONS:\n"
" --tokens=<tokens> Comma-separated string of idents to grep for\n";
/* Write the help text to stdout and terminate with exit status 0 */
static void
usage(void)
{
	fputs(find_banned_usage, stdout);
	exit(0);
}
/*
 * Split @str at every occurrence of @delim.
 *
 * N delimiters always produce N + 1 fields, so leading, trailing and adjacent
 * delimiters yield empty strings rather than being collapsed.
 *
 * Returns a NULL-terminated array of newly allocated strings. The caller owns
 * the array and each element and must free() them. Returns NULL when @str is
 * NULL or when memory cannot be allocated; nothing is leaked in that case.
 * @str itself is not modified.
 */
char **
split(char *str, char delim)
{
	if (!str) {
		return NULL;
	}

	size_t nr_fields = 1;
	for (const char *p = str; *p; p++) {
		if (*p == delim) {
			nr_fields++;
		}
	}

	/* calloc() zero-fills, so the slot after the last field is the terminator */
	char **argv = calloc(nr_fields + 1, sizeof(*argv));
	if (!argv) {
		return NULL;
	}

	size_t filled = 0;
	const char *field_start = str;
	for (const char *p = str; ; p++) {
		if (*p != '\0' && *p != delim) {
			continue;
		}
		argv[filled] = strndup(field_start, (size_t)(p - field_start));
		if (!argv[filled]) {
			goto fail;
		}
		filled++;
		if (*p == '\0') {
			break;
		}
		field_start = p + 1;
	}
	return argv;

fail:
	/* Without this an interior NULL would silently truncate the list */
	while (filled > 0) {
		free(argv[--filled]);
	}
	free(argv);
	return NULL;
}
2023-01-30 21:24:52 +00:00
void
buf_init(struct buf *s)
{
s->alloc = 256;
s->buf = malloc(s->alloc);
s->buf[0] = '\0';
s->len = 0;
}
/*
 * Append the first @len bytes of @data to @s and keep @s NUL-terminated.
 *
 * Nothing happens when @data is NULL, is an empty string, or @len is 0.
 * The allocation grows geometrically so that appending many short pieces
 * (for example a file read line by line) does not reallocate on every call.
 * Terminates the program if memory cannot be obtained.
 */
void
buf_add(struct buf *s, const char *data, size_t len)
{
	if (!data || data[0] == '\0' || len == 0) {
		return;
	}

	/* Existing bytes + new bytes + terminating NUL */
	size_t needed = (size_t)s->len + len + 1;
	if ((size_t)s->alloc < needed) {
		size_t new_alloc = (size_t)s->alloc * 2;
		if (new_alloc < needed) {
			new_alloc = needed;
		}
		/* Keep the old pointer until realloc() is known to have succeeded */
		char *grown = realloc(s->buf, new_alloc);
		if (!grown) {
			perror("realloc");
			exit(EXIT_FAILURE);
		}
		s->buf = grown;
		s->alloc = (int)new_alloc;
	}

	memcpy(s->buf + s->len, data, len);
	s->len += (int)len;
	s->buf[s->len] = '\0';
}
/*
 * Append the single character @ch to @s and keep @s NUL-terminated.
 * Terminates the program if the buffer has to grow and memory cannot be
 * obtained.
 */
void
buf_add_char(struct buf *s, char ch)
{
	/* Room is needed for @ch plus the terminating NUL */
	if (s->alloc <= s->len + 1) {
		int new_alloc = s->alloc * 2 + 16;
		/* Keep the old pointer until realloc() is known to have succeeded */
		char *grown = realloc(s->buf, new_alloc);
		if (!grown) {
			perror("realloc");
			exit(EXIT_FAILURE);
		}
		s->buf = grown;
		s->alloc = new_alloc;
	}
	s->buf[s->len++] = ch;
	s->buf[s->len] = '\0';
}
/*
 * Append a zeroed token to the global token array and return it.
 *
 * The new token has kind TOKEN_NONE, an initialised empty name buffer and
 * the current line number. The returned pointer is only valid until the next
 * call, because growing the array may move it.
 * Terminates the program if the array cannot be grown.
 */
static struct token *
add_token(void)
{
	if (nr_tokens == alloc_tokens) {
		int new_alloc = (alloc_tokens + 16) * 2;
		/* Keep the old array until realloc() is known to have succeeded */
		struct token *grown =
			realloc(tokens, (size_t)new_alloc * sizeof(*grown));
		if (!grown) {
			perror("realloc");
			exit(EXIT_FAILURE);
		}
		tokens = grown;
		alloc_tokens = new_alloc;
	}

	struct token *token = &tokens[nr_tokens++];
	memset(token, 0, sizeof(*token));
	buf_init(&token->name);
	token->line = current_line;
	return token;
}
/*
 * Consume an identifier starting at the lexer cursor into @token->name.
 *
 * The character under the cursor is always taken; after that the token
 * extends over every following letter, digit, '_' or '#'. On return the
 * cursor points at the first character that is not part of the identifier.
 *
 * Implemented as a loop so that stack use does not grow with the length of
 * the identifier.
 */
static void
get_identifier_token(struct token *token)
{
	for (;;) {
		buf_add_char(&token->name, current_buffer_position[0]);
		current_buffer_position++;

		/* <ctype.h> functions require an unsigned char value */
		unsigned char next = (unsigned char)current_buffer_position[0];
		if (!isalnum(next) && next != '_' && next != '#') {
			return;
		}
	}
}
/*
 * Consume a numeric literal starting at the lexer cursor into @token->name.
 *
 * The character under the cursor is always taken; after that the token
 * extends over every following letter, digit or '_'. Taking all letters
 * (not just hex digits and 'x') keeps suffixes and upper-case prefixes such
 * as 10UL or 0X1F inside the literal instead of emitting "UL" or "X1F" as a
 * bogus identifier token. On return the cursor points at the first character
 * that is not part of the literal.
 *
 * Implemented as a loop so that stack use does not grow with the length of
 * the literal.
 */
static void
get_number_token(struct token *token)
{
	for (;;) {
		buf_add_char(&token->name, current_buffer_position[0]);
		current_buffer_position++;

		/* <ctype.h> functions require an unsigned char value */
		unsigned char next = (unsigned char)current_buffer_position[0];
		if (!isalnum(next) && next != '_') {
			return;
		}
	}
}
/*
 * Punctuators recognised by get_special_token() and the code each one maps to.
 *
 * ORDER MATTERS: entries must be sorted by descending length of @combo.
 * The lookup stops scanning as soon as it reaches an entry shorter than the
 * candidate it is trying to match.
 */
struct {
	const char *combo;
	unsigned int special;
} specials[] = {
	/* Three characters */
	{ "...", SPECIAL_ELLIPSIS },
	{ ">>=", SPECIAL_ASSIGN },
	{ "<<=", SPECIAL_ASSIGN },

	/* Two characters */
	{ "+=", SPECIAL_ASSIGN },
	{ "-=", SPECIAL_ASSIGN },
	{ "*=", SPECIAL_ASSIGN },
	{ "/=", SPECIAL_ASSIGN },
	{ "%=", SPECIAL_ASSIGN },
	{ "&=", SPECIAL_ASSIGN },
	{ "^=", SPECIAL_ASSIGN },
	{ "|=", SPECIAL_ASSIGN },
	{ ">>", SPECIAL_BIT_OP },
	{ "<<", SPECIAL_BIT_OP },
	{ "++", SPECIAL_INC_OP },
	{ "--", SPECIAL_DEC_OP },
	{ "->", SPECIAL_PTR_OP },
	{ "&&", SPECIAL_AND_OP },
	{ "||", SPECIAL_OR_OP },
	{ "<=", SPECIAL_COMPARISON_OP },
	{ ">=", SPECIAL_COMPARISON_OP },
	{ "==", SPECIAL_COMPARISON_OP },
	{ "!=", SPECIAL_COMPARISON_OP },
	{ "/*", SPECIAL_COMMENT_BEGIN },
	{ "*/", SPECIAL_COMMENT_END },
	{ "//", SPECIAL_COMMENT_LINE_BEGIN },

	/* One character: the code is the character itself */
	{ ";", ';' },
	{ "{", '{' },
	{ "}", '}' },
	{ ",", ',' },
	{ ":", ':' },
	{ "=", '=' },
	{ "(", '(' },
	{ ")", ')' },
	{ "[", '[' },
	{ "]", ']' },
	{ ".", '.' },
	{ "&", '&' },
	{ "!", '!' },
	{ "~", '~' },
	{ "-", '-' },
	{ "+", '+' },
	{ "*", '*' },
	{ "/", '/' },
	{ "%", '%' },
	{ "<", '<' },
	{ ">", '>' },
	{ "^", '^' },
	{ "|", '|' },
	{ "?", '?' },
};
static void
get_special_token(struct token *token)
{
#define MAX_SPECIAL_LEN (3)
/* Peek up to MAX_SPECIAL_LEN-1 characters ahead */
char buf[MAX_SPECIAL_LEN + 1] = { 0 };
for (int i = 0; i < MAX_SPECIAL_LEN; i++) {
buf[i] = current_buffer_position[i];
if (!current_buffer_position[i]) {
break;
}
}
#undef MAX_SPECIAL_LEN
/* Compare with longest special tokens first */
int k;
for (k = strlen(buf); k > 0; k--) {
for (size_t j = 0; j < sizeof(specials) / sizeof(specials[0]); j++) {
2023-01-30 21:24:52 +00:00
if (strlen(specials[j].combo) < k) {
break;
}
if (!strcmp(specials[j].combo, buf)) {
buf_add(&token->name, buf, k);
token->special = specials[j].special;
goto done;
}
}
buf[k - 1] = '\0';
}
done:
current_buffer_position += token->name.len;
}
/*
 * Skip the remainder of a preprocessor line without producing tokens.
 *
 * The cursor is expected to sit on the directive's first character. It is
 * moved to the next newline (which is counted in current_line) or, if there
 * is none, to the end of the input.
 */
static void
handle_preprocessor_directive(void)
{
	/* Step over the current character, then jump to newline or NUL */
	char *cursor = current_buffer_position + 1;
	cursor += strcspn(cursor, "\n");
	if (*cursor == '\n') {
		++current_line;
	}
	current_buffer_position = cursor;
}
/*
 * Reduce a two-character comment delimiter that has no meaning in the current
 * context to its first character, and move the cursor back so the second
 * character is lexed again on its own.
 */
static void
demote_to_first_char(struct token *token)
{
	current_buffer_position -= token->name.len - 1;
	token->name.len = 1;
	token->name.buf[1] = '\0';
	token->special = (unsigned char)token->name.buf[0];
}

/*
 * Tokenize the NUL-terminated text in @buffer.
 *
 * Returns the global token array, terminated by a token whose kind is
 * TOKEN_NONE. The previous array is not released here.
 *
 * Comments are not removed. They are bracketed by one SPECIAL_COMMENT_BEGIN
 * and one SPECIAL_COMMENT_END token so that consumers can skip the tokens in
 * between. To keep those markers balanced, the lexer tracks whether it is
 * inside a line or block comment:
 *  - comment delimiters that have no effect in the current context
 *    (for example "//" inside a block comment, or a "*" followed by "/"
 *    outside one) are emitted as plain single-character punctuators;
 *  - '#' only starts a preprocessor line outside comments, so the rest of a
 *    comment line, including its terminator, is never skipped.
 *
 * Preprocessor lines are skipped up to the next newline. String and
 * character literals are not recognised; their contents are lexed like code.
 */
struct token *
lex(char *buffer)
{
	enum { IN_CODE, IN_LINE_COMMENT, IN_BLOCK_COMMENT } state = IN_CODE;

	tokens = NULL;
	nr_tokens = 0;
	alloc_tokens = 0;
	current_buffer_position = buffer;

	for (;;) {
		struct token *token = NULL;
		switch (current_buffer_position[0]) {
		case '\0':
			goto out;
		case 'a' ... 'z':
		case 'A' ... 'Z':
		case '_':
			token = add_token();
			get_identifier_token(token);
			token->kind = TOKEN_IDENTIFIER;
			continue;
		case '0' ... '9':
			token = add_token();
			get_number_token(token);
			token->kind = TOKEN_LITERAL;
			continue;
		case '+': case '-': case '*': case '/': case '%': case '.':
		case '>': case '<': case '=': case '!': case '&': case '|':
		case '^': case '{': case '}': case '(': case ')': case ',':
		case ';': case ':': case '[': case ']': case '~': case '?':
			token = add_token();
			get_special_token(token);
			token->kind = TOKEN_SPECIAL;
			switch (token->special) {
			case SPECIAL_COMMENT_LINE_BEGIN:
				if (state == IN_CODE) {
					/* Consumers only know one BEGIN marker */
					token->special = SPECIAL_COMMENT_BEGIN;
					state = IN_LINE_COMMENT;
				} else {
					demote_to_first_char(token);
				}
				break;
			case SPECIAL_COMMENT_BEGIN:
				if (state == IN_CODE) {
					state = IN_BLOCK_COMMENT;
				} else {
					demote_to_first_char(token);
				}
				break;
			case SPECIAL_COMMENT_END:
				if (state == IN_BLOCK_COMMENT) {
					state = IN_CODE;
				} else {
					demote_to_first_char(token);
				}
				break;
			default:
				break;
			}
			continue;
		case '#':
			if (state == IN_CODE) {
				handle_preprocessor_directive();
				/* Never step past the terminating NUL */
				if (current_buffer_position[0] == '\0') {
					goto out;
				}
			}
			break;
		case '\n':
			if (state == IN_LINE_COMMENT) {
				token = add_token();
				token->kind = TOKEN_SPECIAL;
				token->special = SPECIAL_COMMENT_END;
				state = IN_CODE;
			}
			++current_line;
			break;
		default:
			break;
		}
		++current_buffer_position;
	}
out:
	add_token(); /* end marker */
	return tokens;
}
char *
read_file(const char *filename)
{
char *line = NULL;
size_t len = 0;
FILE *stream = fopen(filename, "r");
if (!stream) {
fprintf(stderr, "warn: cannot read '%s'\n", filename);
return NULL;
}
struct buf buffer;
buf_init(&buffer);
while ((getline(&line, &len, stream) != -1)) {
buf_add(&buffer, line, strlen(line));
}
free(line);
fclose(stream);
return buffer.buf;
}
static bool
grep(struct token *tokens, const char *filename, const char *pattern)
2023-01-30 21:24:52 +00:00
{
bool found = false;
unsigned int in_comment = 0;
2023-01-30 21:24:52 +00:00
for (struct token *t = tokens; t->kind; t++) {
if (t->kind == TOKEN_SPECIAL) {
if (t->special == SPECIAL_COMMENT_BEGIN) {
++in_comment;
2023-01-30 21:24:52 +00:00
} else if (t->special == SPECIAL_COMMENT_END) {
--in_comment;
2023-01-30 21:24:52 +00:00
}
}
if (in_comment) {
continue;
}
if (t->kind == TOKEN_IDENTIFIER) {
if (!pattern || !strcmp(t->name.buf, pattern)) {
found = true;
printf("%s:%d\t%s\n", filename, t->line, t->name.buf);
2023-01-30 21:24:52 +00:00
}
}
}
return found;
}
static void
process_one_file(const char *filename)
2023-01-30 21:24:52 +00:00
{
struct token *tokens;
char *buffer = read_file(filename);
2023-01-30 21:24:52 +00:00
if (!buffer) {
exit(EXIT_FAILURE);
2023-01-30 21:24:52 +00:00
}
current_line = 1;
2023-01-30 21:24:52 +00:00
tokens = lex(buffer);
free(buffer);
if (!argv_tokens) {
2023-01-30 21:24:52 +00:00
/* Dump all idents */
grep(tokens, filename, NULL);
2023-01-30 21:24:52 +00:00
} else {
for (char **p = argv_tokens; *p; p++) {
found_token |= grep(tokens, filename, *p);
}
}
}
int
main(int argc, char **argv)
{
if (argc < 2) {
usage();
}
for (int i = 1; i < argc; ++i) {
char *arg = argv[i];
if (!strncmp(arg, "--tokens=", 9)) {
argv_tokens = split(arg + 9, ',');
}
if (!strcmp(arg, "-")) {
char *line = NULL;
size_t len = 0;
while ((getline(&line, &len, stdin) != -1)) {
char *p = strrchr(line, '\n');
if (p) {
*p = '\0';
}
process_one_file(line);
}
free(line);
break;
}
if (arg[0] != '-') {
process_one_file(arg);
break;
2023-01-30 21:24:52 +00:00
}
}
if (argv_tokens) {
for (char **p = argv_tokens; *p; p++) {
free(*p);
}
free(argv_tokens);
}
2023-01-30 21:24:52 +00:00
/* return failure (1) if we have found a banned identifier */
return found_token;
2023-01-30 21:24:52 +00:00
}