Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
ashfame committed Dec 2, 2024
1 parent 9a15f48 commit ba0be4d
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 5 deletions.
23 changes: 22 additions & 1 deletion src/plugin/class-controller-registry.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,29 @@

class Controller_Registry {

public function __construct( string $liberated_data_post_type, string $crawler_data_post_type ) {
public function __construct( string $liberated_data_post_type, string $crawler_queue_post_type ) {
// Both content controllers are constructed with the liberated-data post type.
new Blogpost_Controller( $liberated_data_post_type );
new Page_Controller( $liberated_data_post_type );

// The crawler controller needs the origin of the site being liberated; it is
// derived from the guid of an existing liberated post (empty string when none exists).
$domain = $this->infer_domain( $liberated_data_post_type );

// The crawler controller is the only consumer of the crawler queue post type here.
new Crawler_Controller( $domain, $crawler_queue_post_type );
}

/**
 * Infers the origin (scheme://host[:port]) of the site being liberated.
 *
 * The origin is taken from the guid of a single draft post of the given
 * liberated-data post type.
 *
 * @param string $liberated_data_post_type Post type holding liberated data.
 * @return string Origin such as `https://example.org`, or an empty string when
 *                no draft post exists or its guid is not an absolute URL.
 */
private function infer_domain( $liberated_data_post_type ): string {
	$liberated_posts = get_posts(
		array(
			'post_type'      => $liberated_data_post_type,
			'posts_per_page' => 1,
			'post_status'    => 'draft',
		)
	);

	if ( empty( $liberated_posts ) ) {
		return '';
	}

	// wp_parse_url() returns false for seriously malformed URLs and omits
	// components that are absent, so every key must be checked before use.
	$parts = wp_parse_url( $liberated_posts[0]->guid );
	if ( ! is_array( $parts ) || empty( $parts['scheme'] ) || empty( $parts['host'] ) ) {
		return '';
	}

	$origin = $parts['scheme'] . '://' . $parts['host'];

	// Keep an explicit port so sites served on a non-default port remain reachable.
	if ( ! empty( $parts['port'] ) ) {
		$origin .= ':' . $parts['port'];
	}

	return $origin;
}
}
200 changes: 200 additions & 0 deletions src/plugin/class-crawler-controller.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
<?php

namespace DotOrg\TryWordPress;

use WP_Error;
use WP_REST_Controller;
use WP_REST_Response;
use WP_REST_Server;

class Crawler_Controller extends WP_REST_Controller {

/**
* Domain is inferred from liberated_data guid
*
* @var string $domain Domain/Host being liberated
*/
private string $domain = '';

private string $crawler_queue_post_type;

/**
 * Stores the crawl configuration and hooks route registration into `rest_api_init`.
 *
 * @param string $domain                  Domain/host being liberated.
 * @param string $crawler_queue_post_type Post type used for crawler queue entries.
 */
public function __construct( string $domain, string $crawler_queue_post_type ) {
	// Routes are only registered once WordPress fires `rest_api_init`.
	add_action( 'rest_api_init', array( $this, 'register_routes' ) );

	$this->crawler_queue_post_type = $crawler_queue_post_type;
	$this->domain                  = $domain;
}

/**
 * Registers the crawler REST routes under the `try-wp/v1` namespace.
 *
 * - `/crawler/next`  (GET)       is handled by get_next_url().
 * - `/crawler/queue` (GET, POST) is handled by queue_urls().
 *
 * queue_urls() reads a JSON request body, which clients normally cannot send
 * with GET, so POST is accepted too. GET stays registered so existing callers
 * keep working.
 *
 * NOTE(review): both routes use `__return_true` as permission callback, i.e.
 * they are reachable without authentication — confirm this is intended.
 */
public function register_routes(): void {
	$version   = '1';
	$namespace = 'try-wp/v' . $version;

	register_rest_route(
		$namespace,
		'/crawler/next',
		array(
			array(
				'methods'             => WP_REST_Server::READABLE,
				'callback'            => array( $this, 'get_next_url' ),
				'permission_callback' => '__return_true',
				'args'                => array(
					'context' => array(
						'default' => 'view',
					),
				),
			),
		)
	);

	register_rest_route(
		$namespace,
		'/crawler/queue',
		array(
			array(
				'methods'             => array( WP_REST_Server::READABLE, WP_REST_Server::CREATABLE ),
				'callback'            => array( $this, 'queue_urls' ),
				'permission_callback' => '__return_true',
				// @TODO Specify args here so that sanitization is handled automatically
				'args'                => array(
					'context' => array(
						'default' => 'view',
					),
				),
			),
		)
	);
}

/**
 * Returns the next URL the crawler should visit.
 *
 * - When a queue entry with status `discovered` exists, the guid of the oldest one is returned.
 * - When none is pending but at least one entry is `crawled`, an empty 204 response is returned.
 * - When the queue has neither, the configured domain itself is returned as the starting point.
 *
 * @param mixed $request Incoming REST request (not used).
 * @return WP_REST_Response|WP_Error
 */
public function get_next_url( $request ): WP_REST_Response|WP_Error {
	// Query arguments shared by both lookups; only the status differs.
	$oldest_first = array(
		'post_type'      => $this->crawler_queue_post_type,
		'posts_per_page' => 1,
		'orderby'        => 'date',
		'order'          => 'ASC',
	);

	$pending = get_posts( array_merge( $oldest_first, array( 'post_status' => 'discovered' ) ) );
	if ( ! empty( $pending ) ) {
		return new WP_REST_Response( $pending[0]->guid );
	}

	// Nothing pending: either the crawl is finished or it has not started yet.
	$finished = get_posts( array_merge( $oldest_first, array( 'post_status' => 'crawled' ) ) );
	if ( ! empty( $finished ) ) {
		return new WP_REST_Response( null, 204 );
	}

	// The crawl has not begun, so the domain itself is the first URL.
	return new WP_REST_Response( $this->domain );
}

/**
 * Marks a source URL as crawled and queues the URLs discovered on it.
 *
 * Expects a JSON body of the shape `{ "sourceUrl": string, "urls": string[] }`.
 * `urls` may be omitted, in which case only the source URL is marked as crawled.
 *
 * Responses:
 * - 400 when the body is not a JSON object, `sourceUrl` is missing/invalid,
 *   or `urls` is not a list of strings.
 * - 404 when no queue entry exists for `sourceUrl`.
 * - WP_Error when updating or inserting a queue entry fails.
 * - 200 with an empty body on success.
 *
 * NOTE(review): get_next_url() hands out the bare domain before any queue entry
 * exists; reporting that URL back here yields a 404 — confirm how the very first
 * crawl result is meant to be recorded.
 *
 * @param mixed $request Incoming REST request; only its raw body is read.
 * @return WP_REST_Response|WP_Error
 */
public function queue_urls( $request ): WP_REST_Response|WP_Error {
	$request_data = json_decode( $request->get_body(), true );

	if (
		! is_array( $request_data )
		|| empty( $request_data['sourceUrl'] )
		|| ! is_string( $request_data['sourceUrl'] )
	) {
		return new WP_REST_Response( null, 400 );
	}

	// Validate the whole payload up front so nothing is modified for a bad request.
	$urls = $request_data['urls'] ?? array();
	if ( ! is_array( $urls ) ) {
		return new WP_REST_Response( null, 400 );
	}
	foreach ( $urls as $url ) {
		if ( ! is_string( $url ) ) {
			return new WP_REST_Response( null, 400 );
		}
	}

	// Sanitize once and use the same value for both the lookup and the update,
	// so the existence check and the update cannot disagree.
	$source_url = sanitize_url( $request_data['sourceUrl'] );
	if ( '' === $source_url ) {
		return new WP_REST_Response( null, 400 );
	}

	$post_id = $this->get_post_id_by_guid( $source_url );
	if ( empty( $post_id ) ) {
		return new WP_REST_Response( null, 404 );
	}

	$marked = $this->mark_url_as_crawled( $source_url );
	if ( is_wp_error( $marked ) ) {
		return $marked;
	}

	foreach ( $urls as $url ) {
		$queued_result = $this->queue_url( $url );
		if ( is_wp_error( $queued_result ) ) {
			return $queued_result;
		}
	}

	return new WP_REST_Response();
}

/**
 * Adds a URL to the crawler queue unless an entry for it already exists.
 *
 * New entries are created with status `discovered`, which is the status
 * get_next_url() queries when picking the next URL to crawl.
 *
 * @param string $url URL to queue.
 * @return true|WP_Error True when the URL is queued (or already was), WP_Error
 *                       when the URL is unusable or the insert fails.
 */
private function queue_url( string $url ): true|WP_Error {
	// The guid is stored sanitized, so the duplicate check must use the same form.
	$clean_url = sanitize_url( $url );
	if ( '' === $clean_url ) {
		return new WP_Error(
			'rest_invalid_url',
			__( 'Invalid url', 'try_wordpress' ),
			array( 'status' => 400 )
		);
	}

	// Insert only if it's not present.
	if ( ! empty( $this->get_post_id_by_guid( $clean_url ) ) ) {
		return true;
	}

	$inserted_post_id = wp_insert_post(
		array(
			'post_type'   => $this->crawler_queue_post_type,
			// Without an explicit status wp_insert_post() defaults to `draft`,
			// which get_next_url() never selects.
			'post_status' => 'discovered',
			// A title keeps wp_insert_post() from rejecting the post as empty
			// when the post type supports title, editor and excerpt.
			'post_title'  => $clean_url,
			'guid'        => $clean_url,
		),
		true
	);

	return is_wp_error( $inserted_post_id ) ? $inserted_post_id : true;
}

/**
 * Sets the status of the queue entry for the given URL to `crawled`.
 *
 * @param string $url URL (guid) of the queue entry.
 * @return true|WP_Error True on success; WP_Error with status 404 when no entry
 *                       exists for the URL, or status 500 when the update fails.
 */
private function mark_url_as_crawled( $url ): true|WP_Error {
	$post_id = $this->get_post_id_by_guid( $url );

	// get_post( null ) falls back to the global post, so a missing ID must be
	// handled explicitly rather than passed through.
	$post = null === $post_id ? null : get_post( $post_id );
	if ( ! $post ) {
		return new WP_Error(
			'rest_post_invalid_id',
			__( 'No queued url found to mark as crawled', 'try_wordpress' ),
			array( 'status' => 404 )
		);
	}

	$post->post_status = 'crawled';
	if ( wp_update_post( $post ) === $post->ID ) {
		return true;
	}

	return new WP_Error(
		'rest_save_failed',
		__( 'Failed to update url as crawled', 'try_wordpress' ),
		array( 'status' => 500 )
	);
}

/**
 * Finds the crawler queue entry whose guid equals the given value.
 *
 * Only posts of the crawler queue post type are considered, so a post of
 * another type that happens to share the guid is never returned. Results are
 * cached in the object cache under the `try_wp` group.
 *
 * @param string $guid Guid (URL) to look up.
 * @return int|null Post ID of the matching queue entry, or null when none exists.
 */
public function get_post_id_by_guid( string $guid ): ?int {
	$cache_group = 'try_wp';
	$cache_key   = 'try_wp_crawler_cache_guid_' . md5( $guid );
	$cached_id   = wp_cache_get( $cache_key, $cache_group );

	if ( false !== $cached_id ) {
		// Cache hit - confirm the post still exists and is a queue entry.
		$post = get_post( $cached_id );
		if ( $post && $post->post_type === $this->crawler_queue_post_type ) {
			return (int) $cached_id;
		}
		// Stale entry: drop it and fall through to the database.
		wp_cache_delete( $cache_key, $cache_group );
	}

	// Cache miss - query database
	global $wpdb;
	// phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery
	$post_id = $wpdb->get_var(
		$wpdb->prepare(
			"SELECT ID FROM $wpdb->posts WHERE guid = %s AND post_type = %s LIMIT 1",
			$guid,
			$this->crawler_queue_post_type
		)
	);

	if ( ! $post_id ) {
		return null;
	}

	// Cache the post ID for future lookups
	$post_id = (int) $post_id;
	wp_cache_set( $cache_key, $post_id, $cache_group, YEAR_IN_SECONDS );

	return $post_id;
}
}
9 changes: 5 additions & 4 deletions src/plugin/class-engine.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
class Engine {

public const string LIBERATED_DATA_POST_TYPE = 'liberated_data';
public const string CRAWLER_DATA_POST_TYPE = 'dl_crawler_url';
public const string CRAWLER_QUEUE_POST_TYPE = 'dl_crawler_url';

public function __construct() {
require 'enum-subject-type.php';
Expand All @@ -15,6 +15,7 @@ public function __construct() {
require 'class-liberate-controller.php';
require 'class-blogpost-controller.php';
require 'class-page-controller.php';
require 'class-crawler-controller.php';
require 'class-controller-registry.php';
require 'class-storage.php';
require 'class-subject.php';
Expand All @@ -23,11 +24,11 @@ public function __construct() {
( function () {
$transformer = new Transformer();

new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE, $transformer );
new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE, $transformer );

new Controller_Registry( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
new Controller_Registry( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE );

new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE );

Subject_Repo::init( self::LIBERATED_DATA_POST_TYPE );
} )();
Expand Down
10 changes: 10 additions & 0 deletions tests/plugin/base-test.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,13 @@
/**
* Setup for running tests would come here.
*/

// Fixture for the crawler controller: a draft liberated post from whose guid
// the crawl domain can be inferred.
wp_insert_post(
	array(
		'post_type'   => \DotOrg\TryWordPress\Engine::LIBERATED_DATA_POST_TYPE,
		// wp_insert_post() reads `post_title`; a `title` key is ignored, which
		// leaves the post without title, content or excerpt.
		'post_title'  => 'something to avoid empty filter',
		'guid'        => 'https://example.org/1',
		'post_status' => 'draft',
	)
);
64 changes: 64 additions & 0 deletions tests/plugin/test-crawler-controller.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php

use DotOrg\TryWordPress\Crawler_Controller;
use PHPUnit\Framework\TestCase;

class Crawler_Controller_Test extends TestCase {
private Crawler_Controller $crawler_controller;

private string $domain = 'https://example.org';
private string $namespace = 'try-wp/v1';
private string $endpoint;
private string $crawler_queue_post_type = 'dl_crawl';

/**
 * Builds the endpoint prefix and a fresh controller before every test.
 */
protected function setUp(): void {
	parent::setUp();

	// Note: `base-test.php` sets a `liberated_data` post
	$this->crawler_controller = new Crawler_Controller( $this->domain, $this->crawler_queue_post_type );

	$this->endpoint = sprintf( '/%s/crawler', $this->namespace );
}

public function testRegisterRoutes(): void {
// do_action( 'rest_api_init' ); // so that register_route() executes.

Check warning on line 28 in tests/plugin/test-crawler-controller.php

View workflow job for this annotation

GitHub Actions / phpcs

This comment is 45% valid code; is this commented out code?

$routes = rest_get_server()->get_routes( $this->namespace );
$this->assertArrayHasKey( $this->endpoint . '/next', $routes );
$this->assertArrayHasKey( $this->endpoint . '/queue', $routes );
}

/**

Check failure on line 35 in tests/plugin/test-crawler-controller.php

View workflow job for this annotation

GitHub Actions / phpcs

Missing short description in doc comment
* @group failing
*/
public function testGetNextUrlWithoutQueue(): void {
// first fetch should return the domain itself since that's the first url to crawl
$request = new WP_REST_Request( 'GET', $this->endpoint . '/next' );
$response = rest_do_request( $request );

$this->assertEquals( 200, $response->get_status() );
$this->assertEquals( $this->domain, $response->get_data() );

Check failure on line 44 in tests/plugin/test-crawler-controller.php

View workflow job for this annotation

GitHub Actions / PHPUnit 8.3

Failed asserting that two strings are equal.
}

public function testQueueUrls(): void {
// first fetch should return the domain itself since that's the first url to crawl
$request = new WP_REST_Request( 'GET', $this->endpoint . '/queue' );
$response = rest_do_request( $request );

$this->assertEquals( 200, $response->get_status() );

Check failure on line 52 in tests/plugin/test-crawler-controller.php

View workflow job for this annotation

GitHub Actions / PHPUnit 8.3

Failed asserting that 400 matches expected 200.
$this->assertEquals( $this->domain, $response->get_data() );
}

public function testGetNextUrlFromQueue(): void {
// first fetch should return the domain itself since that's the first url to crawl
$request = new WP_REST_Request( 'GET', $this->endpoint . '/next' );
$response = rest_do_request( $request );

$this->assertEquals( 200, $response->get_status() );
$this->assertEquals( $this->domain, $response->get_data() );

Check failure on line 62 in tests/plugin/test-crawler-controller.php

View workflow job for this annotation

GitHub Actions / PHPUnit 8.3

Failed asserting that two strings are equal.
}
}

0 comments on commit ba0be4d

Please sign in to comment.