From bb6553b00cb4b541205926d32f3078ab6ba074cf Mon Sep 17 00:00:00 2001 From: Jon Date: Sun, 27 Feb 2022 10:32:02 +0000 Subject: [PATCH] Initial public commit --- .gitignore | 5 + license.md | 23 ++++ readme.md | 47 +++++++ uorss.sh | 374 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 449 insertions(+) create mode 100644 .gitignore create mode 100644 license.md create mode 100644 readme.md create mode 100755 uorss.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2a0bd3f --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +config.d +cache +*~ +~* +.#* diff --git a/license.md b/license.md new file mode 100644 index 0000000..8def8e7 --- /dev/null +++ b/license.md @@ -0,0 +1,23 @@ +# MIT+NOMORG License + +Copyright (c) 2022 Jonathan Hyde + +Permission is hereby granted on condition, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +You may not use the Software, or any portions of the Software, for and/or on behalf of the Matrix Foundation or any associated entities. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..d74052b --- /dev/null +++ b/readme.md @@ -0,0 +1,47 @@ +# uorss +Pixiv to RSS bridge written in bash. + +## Setup +1. `$ cd $UORSS_DIR && mkdir -p cache/artwork cache/feeds cache/pages cache/users config.d` +2. Add appropriate settings in config.d. `touch config.d/0_secrets.yaml config.d/1_config.yaml config.d/2_watchlist.yaml` +3. Add a cronjob or a systemd scheduled service. Set `$UORSS_DIR`! +4. Expose `cache/feeds/` and `cache/pages/` as `feeds/` and `pages/` with nginx or httpd. You should use a dedicated root with symlinks to each; don't blindly expose your `$UORSS_DIR` directory! + +### `config` +```yaml +required: [ user_agent, session_id, base_url, pixiv_base_url, depth ] +properties: + user_agent: + comment: 'User agent sent with all requests. Must be a browser user agent, otherwise requests will not go through' + type: string + session_id: + comment: 'Contents of PHPSESSID for a logged in pixiv session. You should put this in its own config file' + type: string + base_url: + comment: 'The base url where you will expose feeds/ and pages/ on your server' + example: 'http://10.0.0.2/static/uorss' + type: string + pixiv_base_url: + comment: 'The base url for pixiv links that uorss generates (tags)' + example: 'https://www.pixiv.net/en' + type: string + depth: + comment: 'How many illustrations you want uorss to pull. Suggested value is 3. Note that uorss will not clean up stale illustrations it downloads!' + type: number + stale_mode: + comment: "Always use cached pixiv user responses. This is for testing so you don't spam the pixiv api with artist lookups" + suggested: false + type: boolean + cache_entry_files: + comment: 'Reuse entry files that were already generated instead of regenerating them on every run. If uorss updates you should set this to false for the first run' + suggested: true + type: boolean + slow_mode: + comment: "Adds various delays while pulling from pixiv. 
# [patch residue, kept verbatim: tail of the readme.md hunk and the diff header of uorss.sh] Use if you're worried about being rate limited or while syncing a lot of artists" + type: boolean + artist: + comment: 'An array of pixiv artist ids. Uorss will not check for duplicates' + type: array + items: + type: string +``` diff --git a/uorss.sh b/uorss.sh new file mode 100755 index 0000000..9c3f45e --- /dev/null +++ b/uorss.sh @@ -0,0 +1,374 @@
#!/usr/bin/env bash
#
# uorss: builds one feed per Pixiv artist.
#
# Merges every *.yml, *.yaml and *.json file found in "$UORSS_DIR/config.d",
# then, for each id in the merged `artists` array, downloads the newest
# artworks into "$UORSS_DIR/cache" and writes cache/feeds/<artist_id>.xml.
#
# Environment:
#   UORSS_DIR  Directory holding config.d/ and cache/. Falls back to $PWD.
# Config keys read below:
#   user_agent, session_id, base_url, pixiv_base_url, depth, stale_mode,
#   cache_entry_files, slow_mode, artists.
#   NOTE: the readme schema spells the last key `artist`; this script reads
#   `artists`.
# External tools: jq, yq (invoked as `yq . FILE -c`), curl.
#
# NOTE(review): the XML/XHTML literals in hoistArtworkEntry and ingestArtist
# were lost from the copy of the script this revision was made from (only the
# interpolated values survived). They are reconstructed here as Atom markup;
# confirm them against the upstream script before relying on the exact output.
VERSION="1.0.0"
# Expanded with a default because UORSS_DIR is allowed to be unset.
CONFIG_DIR="${UORSS_DIR:-}"
set -u
set +H

# die: message...
# Prints the message on stderr and terminates with status 1.
die () { printf '%s\n' "$*" 1>&2; exit 1; }

# stripCtrlChars:
# Filter (stdin to stdout) that rewrites ESC bytes as the literal text \x1b.
stripCtrlChars () { sed 's/\x1b/\\x1b/g' /dev/stdin; }

# This will be removed, dont rely on it
if [ -z "$CONFIG_DIR" ]; then
  echo "main: UORSS_DIR is not defined, assuming working directory. This is potentially unsafe" 1>&2
  CONFIG_DIR="$PWD"
fi

CONFIGD_DIR="$CONFIG_DIR/config.d"
CACHE_DIR="$CONFIG_DIR/cache"

echo "main: Running as $(whoami). Version $VERSION. Directory at $CONFIG_DIR"

[ -d "$CONFIG_DIR" ] || die "main: No uorss directory defined! See readme."
[ -d "$CONFIGD_DIR" ] || die "main: No config.d directory defined! See readme."

# Fail early with a clear message instead of half-way through a run.
for required_tool in jq yq curl; do
  command -v "$required_tool" > /dev/null 2>&1 \
    || die "main: Required tool $required_tool was not found in PATH."
done

# The readme asks the user to create these; creating them here is harmless
# when they already exist.
mkdir -p "$CACHE_DIR/artwork" "$CACHE_DIR/feeds" "$CACHE_DIR/pages" "$CACHE_DIR/users" \
  || die "main: Failed to create the cache directories in $CACHE_DIR"

str_startswith () {
  # str_startswith: prefix string
  # Succeeds when string begins with prefix. The prefix is quoted so it is
  # compared literally, not as a glob.
  [[ "$2" == "$1"* ]]
}

str_endswith () {
  # str_endswith: affix string
  # Succeeds when string ends with affix (compared literally).
  [[ "$2" == *"$1" ]]
}

# Merge all files in config.d/
createConfig () {
  # createConfig:
  # Prints the merged configuration as one line of JSON. Objects are merged
  # recursively in glob order (later files win), except `artists`, whose
  # arrays are concatenated across files.
  local filepath
  local filebase
  local json
  local code
  local config
  config="{}"

  for filepath in "$CONFIGD_DIR/"*; do
    # An empty config.d leaves the glob unexpanded; there is nothing to read.
    [ -e "$filepath" ] || continue
    filebase="$(basename "$filepath")"
    if [ -d "$filepath" ]; then
      echo "createConfig: Ignoring $filebase, Is a directory." 1>&2
    elif str_startswith "~" "$filebase" || str_endswith "~" "$filebase"; then
      echo "createConfig: Ignoring $filebase. Is a temporary file!" 1>&2
    elif str_startswith "." "$filebase"; then
      echo "createConfig: Ignoring $filebase. Is a hidden file!" 1>&2
    elif str_endswith ".yml" "$filebase" || str_endswith ".yaml" "$filebase" || str_endswith ".json" "$filebase"; then
      json="$(yq . "$filepath" -c)"
      code="$?"
      if [ "$code" -ne 0 ]; then
        echo "createConfig: Failed to load $filebase. Exited with code $code" 1>&2
        return "$code"
      fi
      # A file that is still empty (the readme suggests `touch`-ing them) may
      # yield no output or `null`; both are treated as an empty object.
      [ -n "$json" ] || json="null"
      config="$(jq -nc --argjson a "$config" --argjson b "$json" \
        '($b // {}) as $o | $a * $o * { artists: (($a.artists // []) + ($o.artists // [])) }')" \
        || die "createConfig: Failed to merge $filebase with in-memory config. Exited with code $?"
    else
      echo "createConfig: Ignoring $filebase." 1>&2
    fi
  done

  jq . -c <<< "$config"
}

CONFIG="$(createConfig)" || die "main: Failed to load config.d! Exited with $?"
# Boolean switches: non-empty means "on" (jq -e fails for false and null).
SLOW_MODE=""; jq -e .slow_mode 1>/dev/null <<< "$CONFIG" && SLOW_MODE="1"
STALE_MODE=""; jq -e .stale_mode 1>/dev/null <<< "$CONFIG" && STALE_MODE="1"
CACHE_ENTRY_FILES=""; jq -e .cache_entry_files 1>/dev/null <<< "$CONFIG" && CACHE_ENTRY_FILES="1"

DEPTH="$(jq .depth -r <<< "$CONFIG")"
BASE_URL="$(jq .base_url -r <<< "$CONFIG")"
PIXIV_BASE_URL="$(jq .pixiv_base_url -r <<< "$CONFIG")"
SESSION_ID="$(jq .session_id -r <<< "$CONFIG")"
USER_AGENT="$(jq .user_agent -r <<< "$CONFIG")"
COOKIES="Cookie: PHPSESSID=$SESSION_ID"

[ "$BASE_URL" == "null" ] && die "main: base_url cannot be null!"
[ "$PIXIV_BASE_URL" == "null" ] && die "main: pixiv_base_url cannot be null!"
[ "$SESSION_ID" == "null" ] && die "main: session_id cannot be null!"
# DEPTH ends up inside a jq program; refuse anything that is not a plain
# number. "null" (depth not configured) keeps meaning "no limit".
[[ "$DEPTH" == "null" || "$DEPTH" =~ ^[0-9]+(\.[0-9]+)?$ ]] \
  || die "main: depth must be a number!"

[ -n "$STALE_MODE" ] && echo "main: Stale mode is on. New posts by artists will not be fetched!" 1>&2

arrayIndexList () {
  # arrayIndexList: array
  # Prints 1..length of the JSON array, one index per line; prints nothing for
  # an empty array or unparsable input. A shell loop is used because
  # `seq 1 0` counts down on BSD/macOS instead of printing nothing.
  local length
  local index
  length="$(jq 'length' -r <<< "$1")"
  [[ "$length" =~ ^[0-9]+$ ]] || return 0
  for ((index = 1; index <= length; index++)); do
    printf '%s\n' "$index"
  done
}

fetchPixiv () {
  # fetchPixiv: url destination
  # Downloads url with the configured user agent and session cookie. The body
  # goes to "destination~" and is renamed into place only when curl succeeds,
  # so a failed request never leaves a file that later runs would mistake for
  # a cache hit. Returns curl's exit status on failure.
  local code
  if curl -sf \
    -A "$USER_AGENT" \
    -H "$COOKIES" \
    -H "Referer: https://www.pixiv.net/" \
    "$1" 2>/dev/null > "$2~"; then
    mv -- "$2~" "$2"
  else
    code=$?
    rm -f -- "$2~"
    return "$code"
  fi
}

readCachedJson () {
  # readCachedJson: log_prefix log_key url cache_file
  # Prints cache_file, downloading it from url first when it is missing or
  # empty (-s also rejects empty files left behind by failed downloads of
  # earlier versions). Returns non-zero when the download fails.
  if [ -s "$4" ]; then
    echo "$1: Cache HIT for $2" 1>&2
  else
    echo "$1: Cache MISS for $2" 1>&2
    if ! fetchPixiv "$3" "$4"; then
      echo "$1: Download failed for $2" 1>&2
      return 1
    fi
  fi
  cat -- "$4"
}

getArtworkData () {
  # getArtworkData: artwork_id artist_id
  # Prints the (cached) page list of an artwork.
  readCachedJson "getArtworkData" "$2:$1" \
    "https://www.pixiv.net/ajax/illust/$1/pages" \
    "$CACHE_DIR/artwork/$1.pages.json"
}

getArtworkInfo () {
  # getArtworkInfo: artwork_id artist_id
  # Prints the (cached) metadata of an artwork.
  readCachedJson "getArtworkInfo" "$2:$1" \
    "https://www.pixiv.net/ajax/illust/$1" \
    "$CACHE_DIR/artwork/$1.info.json"
}

getArtistInfo () {
  # getArtistInfo: artist_id
  # Prints the (cached) profile of an artist.
  readCachedJson "getArtistInfo" "$1" \
    "https://www.pixiv.net/ajax/user/$1" \
    "$CACHE_DIR/users/$1.info.json"
}

hoistArtworkPage () {
  # hoistArtworkPage: url artwork_id artist_id
  # Downloads one image into cache/pages/<artist_id>/<artwork_id>/ unless it
  # is already there. Returns non-zero when the download fails.
  local pages
  local page
  pages="$CACHE_DIR/pages/$3/$2"
  page="$pages/$(basename "$1")"

  mkdir -p "$pages" || return 1

  if [ -s "$page" ]; then
    echo "hoistArtworkPage: Cache HIT for $3:$2 $1" 1>&2
  else
    echo "hoistArtworkPage: Cache MISS for $3:$2 $1" 1>&2
    fetchPixiv "$1" "$page"
  fi
}

hoistArtworkPages () {
  # hoistArtworkPages: response artwork_id artist_id
  # Downloads every page listed in the pages response. A page that fails to
  # download is logged and skipped. Returns 3 when a page has no original url,
  # 0 otherwise.
  local page_index
  local page_url
  page_index=0

  # fd 3 keeps the list away from the stdin of the commands in the loop.
  while IFS= read -r page_url <&3; do
    page_index=$((page_index + 1))
    if [ "$page_url" == "null" ]; then
      echo "hoistArtworkPages: Page $page_index of $3:$2 has no original url." 1>&2
      return 3
    fi

    if ! hoistArtworkPage "$page_url" "$2" "$3"; then
      echo "hoistArtworkPages: Failed for page $page_index." 1>&2
    fi

    if [ -n "$SLOW_MODE" ]; then sleep .3; fi
  done 3< <(jq -r '.body[] | .urls.original' <<< "$1")
  return 0
}

hoistArtworkEntry () {
  # hoistArtworkEntry: artwork_id artist_id artwork_info_json artwork_pages_json
  # Writes cache/artwork/<artwork_id>.entry.xml. With cache_entry_files on, an
  # existing entry file is reused as is. Returns 2 when the page list cannot
  # be read.
  local file
  local id
  local href
  local title
  local published
  local updated
  local author
  local tags
  local pages
  file="$CACHE_DIR/artwork/$1.entry.xml"
  [ -n "$CACHE_ENTRY_FILES" ] && [ -f "$file" ] && return 0

  id="$PIXIV_BASE_URL/artworks/$(jq '.body.illustId | @uri' -r <<< "$3")"
  href="$(jq '.body.extraData.meta.canonical | @html' -r <<< "$3")"
  title="$(jq '.body.illustTitle | @html' -r <<< "$3")"
  published="$(jq '.body.uploadDate | @html' -r <<< "$3")"
  author="$(jq '.body.userName | @html' -r <<< "$3")"
  updated="$(date -Is | jq @html -rR)"

  # NOTE(review): the anchor markup and the tag url layout are reconstructed;
  # the readme only states that pixiv_base_url is used for tag links.
  tags="$(jq -r --arg base "$PIXIV_BASE_URL" \
    '(.body.tags.tags // [])
      | map("<a href=\"" + ($base + "/tags/" + (.tag | @uri) + "/artworks" | @html) + "\">" + (.tag | @html) + "</a>")
      | join(", ")' <<< "$3")"

  # One image per page, pointing at the copy exposed under base_url.
  # NOTE(review): the wrapping markup is reconstructed.
  pages="$(jq -r --arg base "$BASE_URL/pages/$2/$1" \
    '.body[] | .urls.original | select(type == "string")
      | "<p><img src=\"" + ($base + "/" + (split("/") | last) | @html) + "\" /></p>"' <<< "$4")" || return 2

  # Written next to the target and renamed, because the mere existence of the
  # entry file is what the cache check above tests.
  cat > "$file~" <<EOF || return 1
<entry>
<title>$title</title>
<link rel="alternate" href="$href" />
<id>$id</id>
<published>$published</published>
<updated>$updated</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<p>$tags</p>
$pages
</div>
</content>
<author><name>$author</name></author>
</entry>
EOF
  mv -- "$file~" "$file"
}

ingestArtist () {
  # ingestArtist: artist_id
  # Refreshes the artist's work list (unless stale mode finds a cached one),
  # fetches up to DEPTH of the newest artworks and rewrites
  # cache/feeds/<artist_id>.xml. The previous feed stays untouched when this
  # returns non-zero.
  local artist_file
  local artist_json
  local artist_info_json
  local artwork_id
  local artwork_index
  local artwork_pages_json
  local artwork_info_json
  local feed_url
  local feed_file
  local entries_file
  local artist_url
  local artist_name
  artist_file="$CACHE_DIR/users/$1.json"
  artist_url="$PIXIV_BASE_URL/users/$1"
  feed_url="$BASE_URL/feeds/$1.xml"
  feed_file="$CACHE_DIR/feeds/$1.xml"
  entries_file="$CACHE_DIR/feeds/$1.entries.xml~"

  if [ -n "$STALE_MODE" ] && [ -s "$artist_file" ]; then
    echo "ingestArtist: Cache HIT for $1" 1>&2
  else
    echo "ingestArtist: Cache MISS for $1" 1>&2
    if ! fetchPixiv "https://www.pixiv.net/ajax/user/$1/works/latest?lang=en" "$artist_file"; then
      echo "ingestArtist: Failed to download the work list of artist $1." 1>&2
      return 1
    fi
  fi

  artist_info_json="$(getArtistInfo "$1")" || return 1
  artist_json="$(cat -- "$artist_file")" || return 1

  if jq -e '.error' 1>/dev/null <<< "$artist_info_json"; then
    echo "ingestArtist: Error while reading artist $1 info. Message: $(jq .message -r <<< "$artist_info_json")" 1>&2
    return 1
  fi

  if jq -e '.error' 1>/dev/null <<< "$artist_json"; then
    echo "ingestArtist: Error while reading artist $1. Message: $(jq .message -r <<< "$artist_json")" 1>&2
    return 1
  fi

  # Start from an empty file: a run that crashed earlier may have left entries
  # behind, and an artist without works must still produce a file to read.
  : > "$entries_file" || return 1

  artwork_index=0
  # Ids are sorted numerically; a plain `keys | reverse` compares them as
  # strings and misorders ids of different lengths.
  while IFS= read -r artwork_id <&3; do
    artwork_index=$((artwork_index + 1))
    # Checked before the id is used in any url or path.
    [[ "$artwork_id" =~ ^[0-9]+$ ]] || \
      die "ingestArtist: Exception while handling artist $1, artwork with index of $artwork_index. Illustration ID is not numerical!"

    if ! artwork_info_json="$(getArtworkInfo "$artwork_id" "$1")"; then
      echo "ingestArtist: Skipping artwork $1:$artwork_id, info is unavailable." 1>&2
      continue
    fi
    if ! artwork_pages_json="$(getArtworkData "$artwork_id" "$1")"; then
      echo "ingestArtist: Skipping artwork $1:$artwork_id, page list is unavailable." 1>&2
      continue
    fi
    [ -n "$SLOW_MODE" ] && sleep .3

    hoistArtworkPages "$artwork_pages_json" "$artwork_id" "$1" || echo "ingestArtist: Error while running hoistArtworkPages. Exit code $?" 1>&2

    if hoistArtworkEntry "$artwork_id" "$1" "$artwork_info_json" "$artwork_pages_json"; then
      cat -- "$CACHE_DIR/artwork/$artwork_id.entry.xml" >> "$entries_file"
    else
      echo "ingestArtist: Error while running hoistArtworkEntry. Exit code $?" 1>&2
    fi
  done 3< <(jq -r --argjson depth "$DEPTH" \
    '(.body.illusts // {}) | keys | sort_by(tonumber? // -1) | reverse | .[0:$depth] | .[]' <<< "$artist_json")

  artist_name="$(jq '.body.name | @html' -r <<< "$artist_info_json")"
  # Assembled under a temporary name and renamed so a reader never sees a
  # half-written feed. NOTE(review): element names are reconstructed.
  {
    cat <<EOF
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>$artist_name</title>
<subtitle>$artist_name on Pixiv</subtitle>
<link rel="self" href="$feed_url" />
<link rel="alternate" href="$artist_url" />
<id>$artist_url</id>
<updated>$(date -Is)</updated>
<author><name>$artist_name</name></author>
EOF
    cat -- "$entries_file"
    echo '</feed>'
  } > "$feed_file~" || return 1
  rm -f -- "$entries_file"
  mv -- "$feed_file~" "$feed_file"
}

init () {
  # init:
  # Processes every configured artist in order. An artist that fails is
  # reported and the remaining artists are still processed; the return status
  # is 1 when at least one artist failed.
  local artist_id
  local failed
  failed=0

  while IFS= read -r artist_id <&3; do
    [[ "$artist_id" =~ ^[0-9]+$ ]] || die "init: Exception while handling artist $artist_id. Artist ID is not numerical!"
    if ! ingestArtist "$artist_id"; then
      echo "init[$artist_id]: Unexpected exit code when processing artist. $?" 1>&2
      failed=1
    fi
    [ -n "$SLOW_MODE" ] && sleep 1
  done 3< <(jq -r '(.artists // []) | .[]' <<< "$CONFIG")
  echo "init: End"
  return "$failed"
}

init