mirror of
https://git.sr.ht/~rjarry/aerc
synced 2025-02-22 14:53:57 +01:00

Use codespell to fix typos in code, comments and man pages. Signed-off-by: Robin Jarry <robin@jarry.cc> Reviewed-by: Bence Ferdinandy <bence@ferdinandy.com> Acked-by: inwit <inwit@sindominio.net>
584 lines
13 KiB
C
584 lines
13 KiB
C
/* SPDX-License-Identifier: MIT */
|
|
/* Copyright (c) 2023 Robin Jarry */
|
|
|
|
#define _XOPEN_SOURCE 700
|
|
#include <errno.h>
|
|
#include <getopt.h>
|
|
#include <langinfo.h>
|
|
#include <locale.h>
|
|
#include <regex.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <wchar.h>
|
|
#include <wctype.h>
|
|
|
|
#ifdef __APPLE__
|
|
#include <xlocale.h>
|
|
#endif
|
|
|
|
/*
 * Print the command line help, one line at a time, on stdout.
 */
static void usage(void)
{
	static const char *const help[] = {
		"usage: wrap [-h] [-w INT] [-r] [-l INT] [-f FILE]",
		"",
		"Wrap text without messing up email quotes.",
		"",
		"options:",
		" -h show this help message",
		" -w INT preferred wrap margin (default 80)",
		" -r reflow all paragraphs even if no trailing space",
		" -l INT minimum percentage of letters in a line to be",
		" considered a paragraph",
		" -f FILE read from filename (default stdin)",
	};

	for (size_t i = 0; i < sizeof(help) / sizeof(help[0]); i++)
		puts(help[i]);
}
|
|
|
|
/* preferred wrap margin, compared against display widths (-w) */
static size_t margin = 80;
/* minimum percentage of letters for a line to be treated as prose (-l) */
static size_t prose_ratio = 50;
/* reflow all paragraphs even without trailing space (-r) */
static bool reflow = false;
/* input stream selected by parse_args() (-f, stdin otherwise) */
static FILE *in_file = NULL;
|
|
|
|
static int parse_args(int argc, char **argv)
|
|
{
|
|
const char *filename = NULL;
|
|
long value;
|
|
int c;
|
|
|
|
while ((c = getopt(argc, argv, "hrw:l:f:")) != -1) {
|
|
errno = 0;
|
|
switch (c) {
|
|
case 'r':
|
|
reflow = true;
|
|
break;
|
|
case 'l':
|
|
value = strtol(optarg, NULL, 10);
|
|
if (errno) {
|
|
perror("error: invalid ratio value");
|
|
return 1;
|
|
}
|
|
if (value <= 0 || value >= 100) {
|
|
fprintf(stderr, "error: ratio must be ]0,100[\n");
|
|
return 1;
|
|
}
|
|
prose_ratio = (size_t)value;
|
|
break;
|
|
case 'w':
|
|
value = strtol(optarg, NULL, 10);
|
|
if (errno) {
|
|
perror("error: invalid width value");
|
|
return 1;
|
|
}
|
|
if (value < 1) {
|
|
fprintf(stderr, "error: width must be positive\n");
|
|
return 1;
|
|
}
|
|
margin = (size_t)value;
|
|
break;
|
|
case 'f':
|
|
filename = optarg;
|
|
break;
|
|
default:
|
|
usage();
|
|
return 1;
|
|
}
|
|
}
|
|
if (optind < argc) {
|
|
fprintf(stderr, "%s: unexpected argument -- '%s'\n",
|
|
argv[0], argv[optind]);
|
|
usage();
|
|
return 1;
|
|
}
|
|
if (filename == NULL || !strcmp(filename, "-")) {
|
|
in_file = stdin;
|
|
} else {
|
|
in_file = fopen(filename, "r");
|
|
if (!in_file) {
|
|
perror("error: cannot open file");
|
|
return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Tell whether a string is empty or made only of whitespace characters.
 */
static bool is_empty(const wchar_t *s)
{
	/* the terminating null character is not whitespace, so this stops */
	while (iswspace((wint_t)*s))
		s++;
	return *s == L'\0';
}
|
|
|
|
/*
 * malloc() wrapper that never returns NULL: on allocation failure, the
 * error is reported on stderr and the process is aborted.
 */
__attribute__((malloc,returns_nonnull))
static void *xmalloc(size_t s)
{
	void *mem = malloc(s);

	if (mem != NULL)
		return mem;

	perror("fatal: cannot allocate buffer");
	abort();
}
|
|
|
|
/*
 * realloc() wrapper that never returns NULL: on failure, the error is
 * reported on stderr and the process is aborted.
 *
 * Only returns_nonnull is declared here. The "malloc" attribute is left out
 * on purpose: it promises that the returned storage holds no pointers to
 * existing objects, which a realloc-like function cannot guarantee since the
 * previous contents are preserved.
 */
__attribute__((returns_nonnull))
static void *xrealloc(void *ptr, size_t s)
{
	ptr = realloc(ptr, s);
	if (ptr == NULL) {
		perror("fatal: cannot reallocate buffer");
		abort();
	}
	return ptr;
}
|
|
|
|
/*
 * One paragraph of text, built from one or more input lines.
 * The three strings are heap allocated and owned by the paragraph.
 */
struct paragraph {
	/* leading email quote markers, empty string when there are none */
	wchar_t *quotes;
	/* spaces to print after the quotes on wrapped continuation lines */
	wchar_t *indent;
	/* paragraph contents following the quote markers */
	wchar_t *text;
	/* share of letters in the text, as a percentage */
	size_t prose_ratio;
	/* trailing whitespace was found at the end of the text */
	bool flowed;
	/* the text starts with a list item marker */
	bool list_item;
};
|
|
|
|
static void free_paragraph(struct paragraph *p)
|
|
{
|
|
if (!p)
|
|
return;
|
|
free(p->quotes);
|
|
free(p->indent);
|
|
free(p->text);
|
|
free(p);
|
|
}
|
|
|
|
/*
 * Return a newly allocated, null terminated copy of at most len characters
 * of in. The caller owns the returned buffer.
 */
static wchar_t *read_part(const wchar_t *in, size_t len)
{
	wchar_t *copy = xmalloc((len + 1) * sizeof(wchar_t));
	size_t n = 0;

	/* copy up to len characters, stopping early at the end of in */
	for (; n < len && in[n] != L'\0'; n++)
		copy[n] = in[n];
	/* fill the remainder, terminator included, with null characters */
	for (; n <= len; n++)
		copy[n] = L'\0';

	return copy;
}
|
|
|
|
/*
 * Detect a list item marker at the very beginning of buf.
 *
 * Recognized markers:
 *   - a single '-', '*' or '.' bullet,
 *   - one or two digits,
 *   - a single letter, or up to four more 'i'/'v' letters (any case) when
 *     the first letter is itself 'i' or 'v' (roman numerals).
 * Digits and letters must be followed by ')', '/' or '.'. In all cases the
 * marker must end with a single space.
 *
 * Returns the number of characters used by the marker, trailing space
 * included, or 0 when buf does not start with a list item marker.
 */
static size_t list_item_offset(const wchar_t *buf)
{
	const wint_t first = (wint_t)buf[0];
	bool need_separator = true;
	size_t pos = 1;

	if (buf[0] == L'-' || buf[0] == L'*' || buf[0] == L'.') {
		/* bullets are directly followed by the space */
		need_separator = false;
	} else if (iswdigit(first)) {
		/* optional second digit */
		if (iswdigit((wint_t)buf[1]))
			pos = 2;
	} else if (iswalpha(first)) {
		wchar_t lower = (wchar_t)towlower(first);

		if (lower == L'i' || lower == L'v') {
			/* roman i. ii. iii. iv. ... */
			while (pos < 4) {
				lower = (wchar_t)towlower((wint_t)buf[pos]);
				if (lower != L'i' && lower != L'v')
					break;
				pos++;
			}
		}
	} else {
		return 0;
	}

	if (need_separator) {
		if (buf[pos] != L')' && buf[pos] != L'/' && buf[pos] != L'.')
			return 0;
		pos++;
	}

	return buf[pos] == L' ' ? pos + 1 : 0;
}
|
|
|
|
/*
 * Tell whether c belongs to one of the CJK code point ranges listed below.
 *
 * The Hiragana, Katakana and Hangul Syllables ranges are only taken into
 * account when include_syllables is true.
 */
static bool is_cjk(wchar_t c, bool include_syllables) {
	static const struct {
		wchar_t first;
		wchar_t last;
		bool syllables;
	} ranges[] = {
		{ 0x2e80, 0x2fd5, false }, /* CJK Radicals Supplement */
		{ 0x3300, 0x33ff, false }, /* CJK Compatibility */
		{ 0x3400, 0x4db5, false }, /* CJK Unified Ideographs Extension A */
		{ 0x4e00, 0x9fcb, false }, /* CJK Unified Ideographs */
		{ 0xf900, 0xfa6a, false }, /* CJK Compatibility Ideographs */
		{ 0x1100, 0x11ff, false }, /* Hangul Jamo */
		{ 0x3130, 0x318f, false }, /* Hangul Compatibility Jamo */
		{ 0xa960, 0xa97f, false }, /* Hangul Jamo Extended-A */
		{ 0xd7b0, 0xd7ff, false }, /* Hangul Jamo Extended-B */
		{ 0x3040, 0x309f, true },  /* Japanese Hiragana */
		{ 0x30a0, 0x30ff, true },  /* Japanese Katakana */
		{ 0xac00, 0xd7af, true },  /* Hangul Syllables */
	};

	for (size_t i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		if (ranges[i].syllables && !include_syllables)
			continue;
		if (c >= ranges[i].first && c <= ranges[i].last)
			return true;
	}
	return false;
}
|
|
|
|
static struct paragraph *parse_line(const wchar_t *buf)
|
|
{
|
|
size_t i, q, t, e, letters, indent_len, text_len;
|
|
bool list_item, flowed;
|
|
struct paragraph *p;
|
|
|
|
/*
|
|
* Find relevant positions in the line:
|
|
*
|
|
* '> > > > 2) blah blah blah blah '
|
|
* ^ ^ ^ ^
|
|
* 0 q t e
|
|
* <------><------------->
|
|
* quotes indent
|
|
* <-------------------------------->
|
|
* text
|
|
*/
|
|
|
|
/* detect the end of quotes prefix if any */
|
|
q = 0;
|
|
while (buf[q] == L'>') {
|
|
q++;
|
|
if (buf[q] == L' ') {
|
|
q++;
|
|
}
|
|
}
|
|
/* detect list item prefix & indent */
|
|
t = q;
|
|
while (iswspace((wint_t)buf[t])) {
|
|
t++;
|
|
}
|
|
i = list_item_offset(&buf[t]);
|
|
list_item = i != 0;
|
|
t += i;
|
|
while (iswspace((wint_t)buf[t])) {
|
|
t++;
|
|
}
|
|
indent_len = t - q;
|
|
/* compute prose ratio */
|
|
e = t;
|
|
letters = 0;
|
|
while (buf[e] != L'\0') {
|
|
wchar_t c = buf[e++];
|
|
if (iswalpha((wint_t)c) || is_cjk(c, true)) {
|
|
letters++;
|
|
}
|
|
}
|
|
/* strip trailing whitespace unless it is a signature delimiter */
|
|
flowed = false;
|
|
if (wcscmp(&buf[q], L"-- ") != 0) {
|
|
while (e > q && iswspace((wint_t)buf[e - 1])) {
|
|
e--;
|
|
flowed = true;
|
|
}
|
|
}
|
|
text_len = e - q;
|
|
|
|
p = xmalloc(sizeof(*p));
|
|
memset(p, 0, sizeof(*p));
|
|
p->quotes = read_part(buf, q);
|
|
p->indent = xmalloc((indent_len + 1) * sizeof(wchar_t));
|
|
for (i = 0; i < indent_len; i++)
|
|
p->indent[i] = L' ';
|
|
p->indent[i] = L'\0';
|
|
p->text = read_part(&buf[q], text_len);
|
|
p->flowed = flowed;
|
|
p->list_item = list_item;
|
|
p->prose_ratio = 100 * letters / (text_len ? text_len : 1);
|
|
|
|
return p;
|
|
}
|
|
|
|
static bool is_continuation(
|
|
const struct paragraph *p, const struct paragraph *next
|
|
) {
|
|
if (next->list_item)
|
|
/* new list items always start a new paragraph */
|
|
return false;
|
|
if (next->prose_ratio < prose_ratio || p->prose_ratio < prose_ratio)
|
|
/* does not look like prose, maybe ascii art */
|
|
return false;
|
|
if (wcscmp(next->quotes, p->quotes) != 0)
|
|
/* quote prefix has changed */
|
|
return false;
|
|
if (wcscmp(next->indent, p->indent) != 0)
|
|
/* list item indent has changed */
|
|
return false;
|
|
if (is_empty(next->text))
|
|
/* empty or whitespace only line */
|
|
return false;
|
|
if (wcscmp(p->text, L"--") == 0 || wcscmp(p->text, L"-- ") == 0)
|
|
/* never join anything with signature start */
|
|
return false;
|
|
if (p->flowed)
|
|
/* current paragraph has trailing space, indicating
|
|
* format=flowed */
|
|
return true;
|
|
if (reflow)
|
|
/* user forced paragraph reflow on the command line */
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
static void join_paragraph(
|
|
struct paragraph *p, const struct paragraph *next
|
|
) {
|
|
const wchar_t *append = next->text;
|
|
const wchar_t *separator = L" ";
|
|
size_t len, extra_len;
|
|
wchar_t *text;
|
|
|
|
/* trim leading whitespace of the next paragraph before joining */
|
|
while (*append != L'\0' && iswspace((wint_t)*append))
|
|
append++;
|
|
|
|
len = wcslen(p->text);
|
|
if (len == 0) {
|
|
separator = L"";
|
|
}
|
|
extra_len = wcslen(separator) + wcslen(append) + 1;
|
|
|
|
text = xrealloc(p->text, (len + extra_len) * sizeof(wchar_t));
|
|
swprintf(&text[len], extra_len, L"%ls%ls", separator, append);
|
|
|
|
p->text = text;
|
|
p->prose_ratio = (p->prose_ratio + next->prose_ratio) / 2;
|
|
p->flowed = next->flowed;
|
|
}
|
|
|
|
/*
 * The value of BUFSIZ depends on the libc implementation. Define our own
 * buffer size to get a consistent behaviour on all platforms.
 */
#define BUFFER_SIZE 8192
|
|
|
|
/*
 * Tell whether a line may be broken at character c: either c is whitespace
 * or it is a CJK character (syllable ranges excluded).
 */
static bool is_split_point(const wchar_t c)
{
	return iswspace((wint_t)c) || is_cjk(c, false);
}
|
|
|
|
/*
|
|
* Write a paragraph, wrapping at words boundaries.
|
|
*
|
|
* Only try to do word wrapping on things that look like prose. When the text
|
|
* contains too many non-letter characters, print it as-is.
|
|
*/
|
|
static void write_paragraph(struct paragraph *p)
|
|
{
|
|
size_t quotes_width = (size_t)wcswidth(p->quotes, wcslen(p->quotes));
|
|
size_t remain = (size_t)wcswidth(p->text, wcslen(p->text));
|
|
const wchar_t *indent = L"";
|
|
wchar_t *text = p->text;
|
|
bool more = true;
|
|
int wchar_count;
|
|
wchar_t *line;
|
|
size_t width;
|
|
|
|
while (more) {
|
|
width = quotes_width + (size_t)wcswidth(indent, wcslen(indent));
|
|
|
|
if (width + remain <= margin || p->prose_ratio < prose_ratio) {
|
|
/* whole paragraph fits on a single line */
|
|
line = text;
|
|
wchar_count = (int)wcslen(text);
|
|
more = false;
|
|
} else {
|
|
/* find split point, preferably before margin */
|
|
size_t split = SIZE_MAX;
|
|
size_t w = 0;
|
|
for (size_t i = 0; text[i] != L'\0'; i++) {
|
|
w += (size_t)wcwidth(text[i]);
|
|
if (width + w > margin && split != SIZE_MAX) {
|
|
break;
|
|
}
|
|
if (is_split_point(text[i])) {
|
|
split = i;
|
|
}
|
|
}
|
|
if (split == SIZE_MAX) {
|
|
/* no space found to split, print a long line */
|
|
line = text;
|
|
wchar_count = (int)wcslen(text);
|
|
more = false;
|
|
} else {
|
|
wchar_count = (int)split;
|
|
line = text;
|
|
/* find start of next word */
|
|
while (iswspace((wint_t)text[split])) {
|
|
split++;
|
|
}
|
|
if (text[split] != L'\0') {
|
|
remain -= (size_t)wcswidth(text, split);
|
|
text = &text[split];
|
|
} else {
|
|
/* only trailing whitespace, we're done */
|
|
more = false;
|
|
}
|
|
}
|
|
}
|
|
wprintf(L"%ls%ls%.*ls\n", p->quotes, indent, wchar_count, line);
|
|
indent = p->indent;
|
|
}
|
|
}
|
|
|
|
#define SPACES_PER_TAB 8

/*
 * Copy in to out up to the first LF or CR (excluded), replacing every tab
 * with SPACES_PER_TAB spaces. out is always null terminated.
 *
 * No bounds checking is done: out must have room for SPACES_PER_TAB times
 * the length of in, plus the terminator.
 */
static void sanitize_line(const wchar_t *in, wchar_t *out)
{
	for (; *in != L'\0' && *in != L'\n' && *in != L'\r'; in++) {
		if (*in != L'\t') {
			*out++ = *in;
			continue;
		}
		/* tabs cause indentation/alignment issues, expand them */
		wmemset(out, L' ', SPACES_PER_TAB);
		out += SPACES_PER_TAB;
	}
	*out = L'\0';
}
|
|
|
|
static int set_stdio_encoding(void)
|
|
{
|
|
const char *locale = setlocale(LC_ALL, "");
|
|
|
|
if (!locale) {
|
|
/* Neither LC_ALL nor LANG env vars are defined or are set to
|
|
* a non existent/installed locale. Try with a generic UTF-8
|
|
* locale which is expected to be available on all POSIX
|
|
* systems. */
|
|
locale = setlocale(LC_ALL, "C.UTF-8");
|
|
if (!locale) {
|
|
/* The system is not following POSIX standards. Last
|
|
* resort: check if 'UTF-8' (encoding only) exists. */
|
|
locale = setlocale(LC_CTYPE, "UTF-8");
|
|
}
|
|
}
|
|
if (!locale) {
|
|
perror("error: failed to set locale");
|
|
return 1;
|
|
}
|
|
|
|
/* aerc will always send UTF-8 text, ensure that we read that properly */
|
|
locale_t loc = newlocale(LC_ALL_MASK, locale, NULL);
|
|
char *codeset = nl_langinfo_l(CODESET, loc);
|
|
freelocale(loc);
|
|
if (!strstr(codeset, "UTF-8")) {
|
|
fprintf(stderr, "error: locale '%s' is not UTF-8\n", locale);
|
|
return 1;
|
|
}
|
|
|
|
/* ensure files are configured to read/write wide characters */
|
|
fwide(in_file, true);
|
|
fwide(stdout, true);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
/* line needs to be 8 times larger than buf since every read character
|
|
* may be a tab (very unlikely, but it could happen). */
|
|
static wchar_t buf[BUFFER_SIZE], line[BUFFER_SIZE * SPACES_PER_TAB];
|
|
struct paragraph *cur = NULL, *next;
|
|
bool is_patch = false;
|
|
regmatch_t groups[2];
|
|
char *subject;
|
|
regex_t re;
|
|
int err;
|
|
|
|
err = parse_args(argc, argv);
|
|
if (err)
|
|
goto end;
|
|
|
|
regcomp(&re, "\\<PATCH\\>", REG_EXTENDED);
|
|
subject = getenv("AERC_SUBJECT");
|
|
if (subject && !regexec(&re, subject, 2, groups, 0))
|
|
is_patch = true;
|
|
regfree(&re);
|
|
|
|
err = set_stdio_encoding();
|
|
if (err)
|
|
goto end;
|
|
|
|
while (fgetws(buf, BUFFER_SIZE, in_file)) {
|
|
if (is_patch) {
|
|
/* never reflow patches */
|
|
fputws(buf, stdout);
|
|
continue;
|
|
}
|
|
sanitize_line(buf, line);
|
|
next = parse_line(line);
|
|
if (!cur) {
|
|
cur = next;
|
|
} else if (is_continuation(cur, next)) {
|
|
join_paragraph(cur, next);
|
|
free_paragraph(next);
|
|
} else {
|
|
write_paragraph(cur);
|
|
free_paragraph(cur);
|
|
cur = next;
|
|
}
|
|
}
|
|
if (cur) {
|
|
write_paragraph(cur);
|
|
}
|
|
|
|
end:
|
|
free_paragraph(cur);
|
|
if (in_file) {
|
|
fclose(in_file);
|
|
}
|
|
return err;
|
|
}
|