Installing Dependencies (Optional)

$ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
$ python -m pip install --user virtualenv
$ python -m pip install --user virtualenvwrapper
$ vim ~/.bashrc
export WORKON_HOME=$HOME/.virtualenvs
export VIRTUALENVWRAPPER_VIRTUALENV=$HOME/.local/bin/virtualenv
source $HOME/.local/bin/virtualenvwrapper.sh
$ source ~/.bashrc
$ mkvirtualenv stock-screener
$ pip install django djangorestframework markdown django-filter Scrapy scrapyd python-scrapyd-api scrapy-djangoitem proxybroker celery redis django-celery-results django-celery-beat

Creating a Django Project

$ django-admin startproject stock_screener
$ cd stock_screener

$ mkdir stock_screener/apps/basic_info
$ django-admin startapp basic_info stock_screener/apps/basic_info

$ mkdir stock_screener/apps/northbound
$ django-admin startapp northbound stock_screener/apps/northbound

$ mkdir stock_screener/apps/financial_metrics
$ django-admin startapp financial_metrics stock_screener/apps/financial_metrics

$ mkdir stock_screener/apps/quotes
$ django-admin startapp quotes stock_screener/apps/quotes

$ mkdir stock_screener/apps/jobs
$ django-admin startapp jobs stock_screener/apps/jobs

Note: the directory argument to the startapp command must not end with a trailing slash, or the command will fail.

$ vim stock_screener/settings.py

INSTALLED_APPS = [
    ...
    'rest_framework',
    'stock_screener.apps.basic_info',
    'stock_screener.apps.northbound',
    'stock_screener.apps.financial_metrics',
    'stock_screener.apps.quotes',
    'stock_screener.apps.jobs',
]
$ python manage.py migrate
$ python manage.py createsuperuser --email admin@example.com --username admin
$ vim stock_screener/apps/basic_info/models.py
from django.db import models

class StockList(models.Model):
    category = models.CharField(max_length=255)
    code = models.CharField(max_length=255, primary_key=True)
    name = models.CharField(max_length=255)
    listing_date = models.DateField()
    establish_date = models.DateField()
    industry = models.CharField(max_length=255)
    main_business = models.TextField()

    class Meta:
        ordering = ['code']
$ python manage.py makemigrations basic_info
$ python manage.py migrate basic_info
$ vim stock_screener/apps/northbound/models.py
from django.db import models

class NetFlow(models.Model):
    code = models.CharField(max_length=255, primary_key=True)
    name = models.CharField(max_length=255)
    inflow = models.FloatField()
    change_pct = models.FloatField()
    five_inflow = models.FloatField()
    five_change = models.FloatField()
    twenty_inflow = models.FloatField()
    twenty_change = models.FloatField()
    sixty_inflow = models.FloatField()
    sixty_change = models.FloatField()
    cost = models.FloatField()
    profit_percent = models.FloatField()
    market_code = models.IntegerField()
    shares = models.IntegerField()

    class Meta:
        ordering = ['inflow']


class ShareHolding(models.Model):
    # concept code
    code = models.CharField(max_length=255, primary_key=True)
    # concept name
    name = models.CharField(max_length=255)
    inflow = models.FloatField()
    change_pct = models.FloatField()
    five_inflow = models.FloatField()
    five_change = models.FloatField()
    twenty_inflow = models.FloatField()
    twenty_change = models.FloatField()
    sixty_inflow = models.FloatField()
    sixty_change = models.FloatField()

    class Meta:
        ordering = ['inflow']
$ python manage.py makemigrations northbound
$ python manage.py migrate northbound
$ vim stock_screener/apps/northbound/views.py
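The original post leaves the view bodies out. Below is a minimal sketch of the two list views referenced by the URLconf that follows, assuming plain .values() serialization since no DRF serializer classes appear in the post:

from rest_framework.decorators import api_view
from rest_framework.response import Response

from stock_screener.apps.northbound.models import NetFlow, ShareHolding


@api_view(['GET'])
def netflow_list(request):
    # Per-stock northbound net-flow records.
    return Response(list(NetFlow.objects.values()))


@api_view(['GET'])
def netflow_concept_list(request):
    # Per-concept northbound net-flow records (the ShareHolding model).
    return Response(list(ShareHolding.objects.values()))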
$ vim stock_screener/apps/northbound/urls.py
from django.urls import path
from stock_screener.apps.northbound import views

urlpatterns = [
    path('netflow/', views.netflow_list),
    path('netflow_concept/', views.netflow_concept_list),
]
$ vim stock_screener/urls.py
from django.contrib import admin
from django.urls import path, include

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include('stock_screener.apps.northbound.urls')),
]
$ vim stock_screener/apps/northbound/admin.py
from django.contrib import admin

from stock_screener.apps.northbound.models import NetFlow, ShareHolding

for model in (NetFlow, ShareHolding):
    admin.site.register(model)

Setting Up Scrapy in the Django Settings

$ vim stock_screener/settings.py
# integrate scrapy with django START

import os
from os.path import abspath, dirname, join
from sys import path

PROJECT_ROOT = abspath(dirname(__file__))
root = lambda *x: abspath(join(abspath(PROJECT_ROOT), *x))
SCRAPER_ROOT = root('apps/scraper')
path.append(SCRAPER_ROOT)

os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'scraper.settings')

# integrate scrapy with django DONE

Creating the Scrapy Project

$ cd stock_screener/apps/
$ scrapy startproject scraper
$ cd scraper
$ scrapy genspider northbound <domain>

Setting Up Django in the Scrapy Settings

$ vim scraper/settings.py
# DJANGO INTEGRATION START

import os
from os.path import abspath, dirname, join
from sys import path

here = lambda *x: join(abspath(dirname(__file__)), *x)
PROJECT_ROOT = here('../../../../')  # directory path of the stock_screener project
path.append(abspath(PROJECT_ROOT))
os.environ['DJANGO_SETTINGS_MODULE'] = 'stock_screener.settings'

# This is required only if the Django version is > 1.8.
# https://docs.djangoproject.com/en/2.2/ref/applications/#troubleshooting
import django
django.setup()

# DJANGO INTEGRATION DONE

Writing the Scrapy Project

$ vim stock_screener/apps/scraper/scraper/items.py
from scrapy_djangoitem import DjangoItem

from stock_screener.apps.northbound.models import NetFlow, ShareHolding


class NetFlowItem(DjangoItem):
    django_model = NetFlow


class ShareHoldingItem(DjangoItem):
    django_model = ShareHolding
$ vim stock_screener/apps/scraper/scraper/spiders/northbound.py
Create the spider here. The page-parsing logic is site-specific, so the original post leaves it out; a minimal skeleton is sketched below.
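The skeleton assumes the target is a JSON endpoint and fills the NetFlowItem defined above; the URL and field mapping are placeholders, not the author's actual code:

import json

import scrapy

from scraper.items import NetFlowItem


class NorthboundSpider(scrapy.Spider):
    name = 'northbound'
    # Placeholder: replace with the real northbound net-flow endpoint.
    start_urls = ['https://example.com/api/northbound/netflow']

    def parse(self, response):
        # Assumes the endpoint returns a JSON list of per-stock records.
        for row in json.loads(response.text):
            item = NetFlowItem()
            item['code'] = row.get('code')
            item['name'] = row.get('name')
            item['inflow'] = row.get('inflow')
            yield item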
$ vim stock_screener/apps/scraper/scraper/pipelines.py
from django.db import transaction

from stock_screener.apps.northbound.models import NetFlow, ShareHolding


class NorthboundPipeline:

    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(item)
        return item

    def close_spider(self, spider):
        # Replace the old snapshot atomically: wipe the tables, then save
        # every collected item inside a single transaction.
        try:
            with transaction.atomic():
                NetFlow.objects.all().delete()
                ShareHolding.objects.all().delete()
                for item in self.items:
                    item.save()
        except Exception as e:
            spider.logger.error(e)
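For the pipeline to run, it must be enabled in scraper/settings.py (standard Scrapy configuration):

ITEM_PIPELINES = {
    'scraper.pipelines.NorthboundPipeline': 300,
}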

Using Celery with Django to Call Scrapy

$ sudo pacman -S docker
$ sudo systemctl start docker && sudo systemctl enable docker
$ sudo vim /etc/docker/daemon.json
{
  "storage-driver": "overlay2",
  "registry-mirrors" : [
    "http://docker.mirrors.ustc.edu.cn",
    "http://registry.docker-cn.com",
    "http://hub-mirror.c.163.com",
    "http://ovfftd6p.mirror.aliyuncs.com"
  ]
}
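Restart the Docker daemon so the mirror configuration takes effect:

$ sudo systemctl restart docker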
$ sudo docker run -d -p 6379:6379 redis
$ vim stock_screener/settings.py
INSTALLED_APPS = [
    ...
    'django_celery_results',
    'django_celery_beat',
    ...
]

CELERY_BROKER_URL = os.environ.get('REDIS_URL', 'redis://localhost')
CELERY_RESULT_BACKEND = 'django-db'
$ python manage.py migrate django_celery_results
$ python manage.py migrate django_celery_beat

Defining the Celery Instance

$ vim stock_screener/celery.py
from __future__ import absolute_import, unicode_literals
import os
from celery import Celery

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'stock_screener.settings')
app = Celery('stock_screener')
app.config_from_object('django.conf:settings', namespace='CELERY')


# Solve Django + Celery + Scrapy twisted reactor(ReactorNotRestartable) errors.
app.conf.update(
    worker_max_tasks_per_child=1,
    broker_pool_limit=None
)

app.autodiscover_tasks()

@app.task(bind=True)
def debug_task(self):
    print('Request: {0!r}'.format(self.request))
$ vim stock_screener/__init__.py
from .celery import app as celery_app

__all__ = ('celery_app',)

Creating Tasks

$ vim stock_screener/apps/jobs/tasks.py
from celery import shared_task
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


@shared_task
def scrape_task(spider_name):
    """Celery task to scrape a website with Scrapy.
    http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    """
    configure_logging()
    s = get_project_settings()
    s.update({'LOG_FILE': 'scrapy.log'})
    process = CrawlerProcess(s)
    process.crawl(spider_name)
    process.start()
$ vim stock_screener/apps/jobs/views.py
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from rest_framework.decorators import api_view

from stock_screener.apps.jobs.tasks import scrape_task

# Spiders that may be scheduled through this endpoint.
spider_names = ('northbound',)


@csrf_exempt
@api_view(['POST'])
def scrape_job(request):
    """
    Create a new scrapy job for the requested spider.
    """
    spider_name = request.data.get('spider_name')
    if spider_name in spider_names:
        res = scrape_task.delay(spider_name)
        return JsonResponse({'task_id': res.task_id, 'status': 'started'})
    else:
        return JsonResponse({'task_id': None, 'status': 'not found'})
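The post does not show URL routing for the jobs app, but the curl test below assumes it exists. A minimal sketch (file contents assumed):

$ vim stock_screener/apps/jobs/urls.py

from django.urls import path

from stock_screener.apps.jobs import views

urlpatterns = [
    path('scrape_job/', views.scrape_job),
]

Then hook it into the urlpatterns in stock_screener/urls.py:

    path('jobs/', include('stock_screener.apps.jobs.urls')),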

Starting the Worker Processes

$ celery -A stock_screener worker -l info
$ celery -A stock_screener beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler
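With django_celery_beat's DatabaseScheduler, periodic runs can be configured through the Django admin or programmatically. A sketch using the django_celery_beat models (the schedule values are arbitrary examples):

import json

from django_celery_beat.models import CrontabSchedule, PeriodicTask

# Example: run the northbound spider every weekday at 18:00.
schedule, _ = CrontabSchedule.objects.get_or_create(
    minute='0',
    hour='18',
    day_of_week='1-5',
)
PeriodicTask.objects.get_or_create(
    name='scrape northbound daily',
    task='stock_screener.apps.jobs.tasks.scrape_task',
    crontab=schedule,
    args=json.dumps(['northbound']),
)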

Testing

$ python manage.py runserver 0.0.0.0:8000
$ curl -X POST \
  http://localhost:8000/jobs/scrape_job/ \
  -H 'content-type: application/json' \
  -d '{"spider_name": "northbound"}'
Then verify the scraped data and task results in the Django admin at http://localhost:8000/admin/.
