Skip to content

Commit

Permalink
added -k/--convert-links and -K/--backup-converted
Browse files Browse the repository at this point in the history
  • Loading branch information
rockdaboot committed Jul 17, 2014
1 parent 8d20965 commit 7f131b0
Show file tree
Hide file tree
Showing 10 changed files with 261 additions and 35 deletions.
2 changes: 1 addition & 1 deletion examples/print_html_urls.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ static void html_parse_localfile(const char *fname)
char *data;

if ((data = mget_read_file(fname, NULL))) {
MGET_HTML_PARSE_RESULT *res = mget_html_get_urls_inline(data);
MGET_HTML_PARSED_RESULT *res = mget_html_get_urls_inline(data);

if (res->encoding)
printf("URI encoding '%s'\n", res->encoding);
Expand Down
16 changes: 9 additions & 7 deletions include/libmget.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ void
*/

void *
mget_memdup(const void *s, size_t n) G_GNUC_MGET_MALLOC G_GNUC_MGET_ALLOC_SIZE(2);
mget_memdup(const void *s, size_t n) G_GNUC_MGET_ALLOC_SIZE(2);
char *
mget_strdup(const char *s) G_GNUC_MGET_MALLOC;

Expand Down Expand Up @@ -758,7 +758,7 @@ typedef struct mget_iri_st {
const char *
connection_part; // helper, e.g. http://www.example.com:8080
size_t
dirlen; // length of directory part in 'path' (needed/initialized on with --no-parent)
dirlen; // length of directory part in 'path' (needed/initialized with --no-parent)
char
host_allocated; // if set, free host in iri_free()
} mget_iri_t;
Expand Down Expand Up @@ -786,9 +786,11 @@ int
int
mget_iri_compare(mget_iri_t *iri1, mget_iri_t *iri2) G_GNUC_MGET_PURE G_GNUC_MGET_NONNULL_ALL;
mget_iri_t *
mget_iri_parse(const char *uri, const char *encoding) G_GNUC_MGET_MALLOC;
mget_iri_parse(const char *uri, const char *encoding);
mget_iri_t *
mget_iri_parse_base(mget_iri_t *base, const char *url, const char *encoding) G_GNUC_MGET_MALLOC;
mget_iri_parse_base(mget_iri_t *base, const char *url, const char *encoding);
mget_iri_t *
mget_iri_clone(mget_iri_t *iri);
const char *
mget_iri_get_connection_part(mget_iri_t *iri);
const char *
Expand Down Expand Up @@ -977,12 +979,12 @@ typedef struct {
base;
char
follow;
} MGET_HTML_PARSE_RESULT;
} MGET_HTML_PARSED_RESULT;

MGET_HTML_PARSE_RESULT *
MGET_HTML_PARSED_RESULT *
mget_html_get_urls_inline(const char *html);
void
mget_html_free_urls_inline(MGET_HTML_PARSE_RESULT **res);
mget_html_free_urls_inline(MGET_HTML_PARSED_RESULT **res);

void
mget_sitemap_get_urls_inline(const char *sitemap, mget_vector_t **urls, mget_vector_t **sitemap_urls);
Expand Down
1 change: 1 addition & 0 deletions libmget/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ void mget_buffer_free_data(mget_buffer_t *buf)
/*
 * Mark the buffer as empty without releasing its storage.
 *
 * The first byte of the data area is set to NUL, so the contents read as
 * an empty C string, and the stored length is set to zero.
 * Both 'buf' and 'buf->data' are dereferenced unconditionally, so the
 * caller has to pass a buffer that owns a data area.
 */
void mget_buffer_reset(mget_buffer_t *buf)
{
	buf->data[0] = '\0';
	buf->length = 0;
}

size_t mget_buffer_memcpy(mget_buffer_t *buf, const void *data, size_t length)
Expand Down
10 changes: 5 additions & 5 deletions libmget/html_url.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
#include "private.h"

typedef struct {
MGET_HTML_PARSE_RESULT
MGET_HTML_PARSED_RESULT
result;
char
found_robots,
Expand Down Expand Up @@ -90,7 +90,7 @@ static void _html_get_url(void *context, int flags, const char *dir, const char
}

if ((flags & XML_FLG_ATTRIBUTE) && val) {
MGET_HTML_PARSE_RESULT *res = &ctx->result;
MGET_HTML_PARSED_RESULT *res = &ctx->result;

// info_printf("%02X %s %s '%.*s' %zd %zd\n", flags, dir, attr, (int) len, val, len, pos);

Expand Down Expand Up @@ -194,7 +194,7 @@ static void _urls_to_absolute(MGET_VECTOR *urls, MGET_IRI *base)
}
*/

void mget_html_free_urls_inline(MGET_HTML_PARSE_RESULT **res)
void mget_html_free_urls_inline (MGET_HTML_PARSED_RESULT **res)
{
if (res && *res) {
xfree((*res)->encoding);
Expand All @@ -203,12 +203,12 @@ void mget_html_free_urls_inline(MGET_HTML_PARSE_RESULT **res)
}
}

MGET_HTML_PARSE_RESULT *mget_html_get_urls_inline(const char *html)
MGET_HTML_PARSED_RESULT *mget_html_get_urls_inline(const char *html)
{
_HTML_CONTEXT context = { .result.follow = 1 };

// context.result.uris = mget_vector_create(32, -2, NULL);
mget_html_parse_buffer(html, _html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT);

return (MGET_HTML_PARSE_RESULT *)mget_memdup(&context.result, sizeof(context.result));
return mget_memdup(&context.result, sizeof(context.result));
}
31 changes: 31 additions & 0 deletions libmget/iri.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* URI/IRI routines
* about encoding see http://nikitathespider.com/articles/EncodingDivination.html
* about GET encoding see http://stackoverflow.com/questions/1549213/whats-the-correct-encoding-of-http-get-request-strings
* RFC 3986: URI generic syntax
*
*
* Changelog
Expand Down Expand Up @@ -334,6 +335,36 @@ mget_iri_t *mget_iri_parse(const char *url, const char *encoding)
return iri;
}

/*
 * Translate the pointer 'p', which refers to a byte inside the memory block
 * starting at 'from' ('size' bytes long), into the pointer to the same byte
 * inside the copy of that block starting at 'to'.
 *
 * NULL stays NULL. A pointer that does not lie inside the source block
 * (e.g. one referring to a static string) is returned unchanged, since
 * rebasing it would yield a pointer to nowhere.
 */
static char *_iri_rebase(const char *p, const mget_iri_t *from, mget_iri_t *to, size_t size)
{
	const char *base = (const char *) from;

	if (!p)
		return NULL;

	if (p < base || p >= base + size)
		return (char *) p;

	return (char *) to + (size_t) (p - base);
}

/*
 * Create an independent copy of 'iri'.
 *
 * The IRI structure and the string area following it are duplicated as one
 * memory block; all members pointing into that block are adjusted to point
 * into the copy. 'connection_part' is always duplicated separately, 'host'
 * only if 'host_allocated' is set. 'scheme' is copied as is.
 *
 * Returns the clone, or NULL if 'iri' or 'iri->uri' is NULL or if the copy
 * could not be created.
 */
mget_iri_t *mget_iri_clone(mget_iri_t *iri)
{
	// the block size is derived from strlen(iri->uri), so uri must not be NULL
	if (!iri || !iri->uri)
		return NULL;

	// NOTE(review): this size is assumed to match the allocation made by
	// mget_iri_parse() (structure + two copies of the URI string) - confirm
	size_t size = sizeof(mget_iri_t) + strlen(iri->uri) * 2 + 2;
	mget_iri_t *clone = mget_memdup(iri, size);

	if (!clone)
		return NULL;

	// NOTE(review): connection_part may still be unset here, so this relies
	// on mget_strdup() accepting NULL - confirm
	clone->connection_part = mget_strdup(iri->connection_part);

	// a separately allocated host needs its own copy, the clone owns it
	if (iri->host_allocated)
		clone->host = mget_strdup(iri->host);
	else
		clone->host = _iri_rebase(iri->host, iri, clone, size);

	// scheme is not adjusted, it is a pointer to a static string
	clone->uri = _iri_rebase(iri->uri, iri, clone, size);
	clone->display = _iri_rebase(iri->display, iri, clone, size);
	clone->userinfo = _iri_rebase(iri->userinfo, iri, clone, size);
	clone->password = _iri_rebase(iri->password, iri, clone, size);
	clone->port = _iri_rebase(iri->port, iri, clone, size);
	clone->resolv_port = _iri_rebase(iri->resolv_port, iri, clone, size);
	clone->path = _iri_rebase(iri->path, iri, clone, size);
	clone->query = _iri_rebase(iri->query, iri, clone, size);
	clone->fragment = _iri_rebase(iri->fragment, iri, clone, size);

	return clone;
}

static char *_iri_build_connection_part(mget_iri_t *iri)
{
char *tag;
Expand Down
Loading

0 comments on commit 7f131b0

Please sign in to comment.