Installing Dependencies (Optional)
$ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
$ python -m pip install --user virtualenv
$ python -m pip install --user virtualenvwrapper
Add the following lines to your shell startup file (e.g. ~/.bashrc), then reload the shell:

export WORKON_HOME=$HOME/.virtualenvs
export VIRTUALENVWRAPPER_VIRTUALENV=$HOME/.local/bin/virtualenv
source $HOME/.local/bin/virtualenvwrapper.sh
$ mkvirtualenv stock-screener

$ pip install django djangorestframework markdown django-filter Scrapy scrapyd python-scrapyd-api scrapy-djangoitem proxybroker celery redis django-celery-results django-celery-beat
Creating a Django Project
$ django-admin startproject stock_screener
$ cd stock_screener
$ mkdir -p stock_screener/apps/basic_info
$ django-admin startapp basic_info stock_screener/apps/basic_info
$ mkdir -p stock_screener/apps/northbound
$ django-admin startapp northbound stock_screener/apps/northbound
$ mkdir -p stock_screener/apps/financial_metrics
$ django-admin startapp financial_metrics stock_screener/apps/financial_metrics
$ mkdir -p stock_screener/apps/quotes
$ django-admin startapp quotes stock_screener/apps/quotes
$ mkdir -p stock_screener/apps/jobs
$ django-admin startapp jobs stock_screener/apps/jobs
Note: the destination path passed to the startapp command must not have a trailing slash.
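Because the apps live under stock_screener/apps/ instead of the project root, the apps.py generated in each app must use the full dotted path as its name, otherwise Django cannot load the app from INSTALLED_APPS. For example, in stock_screener/apps/basic_info/apps.py (the other apps are analogous):

from django.apps import AppConfig


class BasicInfoConfig(AppConfig):
    # startapp generates name = 'basic_info' by default;
    # it must match the dotted path used in INSTALLED_APPS.
    name = 'stock_screener.apps.basic_info'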
Then register DRF and the new apps in stock_screener/settings.py (the quotes and jobs apps are included here as well, since the jobs app is used by the Celery integration below):

INSTALLED_APPS = [
    ...
    'rest_framework',
    'stock_screener.apps.basic_info',
    'stock_screener.apps.northbound',
    'stock_screener.apps.financial_metrics',
    'stock_screener.apps.quotes',
    'stock_screener.apps.jobs',
]
$ python manage.py migrate
$ python manage.py createsuperuser --email admin@example.com --username admin
$ vim stock_screener/apps/basic_info/models.py
from django.db import models


class StockList(models.Model):
    category = models.CharField(max_length=255)
    code = models.CharField(max_length=255, primary_key=True)
    name = models.CharField(max_length=255)
    listing_date = models.DateField()
    establish_date = models.DateField()
    industry = models.CharField(max_length=255)
    main_business = models.TextField()

    class Meta:
        ordering = ['code']
$ python manage.py makemigrations basic_info
$ python manage.py migrate basic_info
$ vim stock_screener/apps/northbound/models.py
from django.db import models


class NetFlow(models.Model):
    code = models.CharField(max_length=255, primary_key=True)
    name = models.CharField(max_length=255)
    inflow = models.FloatField()
    change_pct = models.FloatField()
    five_inflow = models.FloatField()
    five_change = models.FloatField()
    twenty_inflow = models.FloatField()
    twenty_change = models.FloatField()
    sixty_inflow = models.FloatField()
    sixty_change = models.FloatField()
    cost = models.FloatField()
    profit_percent = models.FloatField()
    market_code = models.IntegerField()
    shares = models.IntegerField()

    class Meta:
        ordering = ['inflow']


class ShareHolding(models.Model):
    # concept code
    code = models.CharField(max_length=255, primary_key=True)
    # concept name
    name = models.CharField(max_length=255)
    inflow = models.FloatField()
    change_pct = models.FloatField()
    five_inflow = models.FloatField()
    five_change = models.FloatField()
    twenty_inflow = models.FloatField()
    twenty_change = models.FloatField()
    sixty_inflow = models.FloatField()
    sixty_change = models.FloatField()

    class Meta:
        ordering = ['inflow']
$ python manage.py makemigrations northbound
$ python manage.py migrate northbound
$ vim stock_screener/apps/northbound/views.py
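The contents of views.py are not shown above. A minimal sketch of the two list views referenced by the URLconf below, using DRF's @api_view decorator together with hypothetical ModelSerializer classes (the serializer names and field lists are assumptions, not from the original), could look like this:

from rest_framework import serializers
from rest_framework.decorators import api_view
from rest_framework.response import Response

from stock_screener.apps.northbound.models import NetFlow, ShareHolding


# Hypothetical serializers; narrow the field lists as needed.
class NetFlowSerializer(serializers.ModelSerializer):
    class Meta:
        model = NetFlow
        fields = '__all__'


class ShareHoldingSerializer(serializers.ModelSerializer):
    class Meta:
        model = ShareHolding
        fields = '__all__'


@api_view(['GET'])
def netflow_list(request):
    """List northbound net flow for all stocks."""
    queryset = NetFlow.objects.all()
    return Response(NetFlowSerializer(queryset, many=True).data)


@api_view(['GET'])
def netflow_concept_list(request):
    """List northbound net flow aggregated by concept."""
    queryset = ShareHolding.objects.all()
    return Response(ShareHoldingSerializer(queryset, many=True).data)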
$ vim stock_screener/apps/northbound/urls.py
from django.urls import path

from stock_screener.apps.northbound import views

urlpatterns = [
    path('netflow/', views.netflow_list),
    path('netflow_concept/', views.netflow_concept_list),
]
$ vim stock_screener/urls.py
from django.contrib import admin
from django.urls import path, include

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include('stock_screener.apps.northbound.urls')),
]
$ vim stock_screener/apps/northbound/admin.py
from django.contrib import admin

from stock_screener.apps.northbound.models import NetFlow, ShareHolding

for model in (NetFlow, ShareHolding):
    admin.site.register(model)
Integrating Scrapy into the Django Settings
$ vim stock_screener/settings.py
# integrate scrapy with django START
import os
from os.path import abspath, dirname, join
from sys import path

PROJECT_ROOT = abspath(dirname(__file__))
root = lambda *x: abspath(join(PROJECT_ROOT, *x))

SCRAPER_ROOT = root('apps/scraper')
path.append(SCRAPER_ROOT)
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'scraper.settings')
# integrate scrapy with django DONE
Creating the Scrapy Project
$ cd stock_screener/apps/
$ scrapy startproject scraper
$ cd scraper
$ scrapy genspider northbound <domain>
Integrating Django into the Scrapy Settings
$ vim scraper/settings.py
# DJANGO INTEGRATION START
import os
from os.path import abspath, dirname, join
from sys import path

here = lambda *x: join(abspath(dirname(__file__)), *x)
PROJECT_ROOT = here('../../../../')  # directory path of stock_screener
path.append(abspath(PROJECT_ROOT))
os.environ['DJANGO_SETTINGS_MODULE'] = 'stock_screener.settings'

# This is required only if Django version > 1.8
# https://docs.djangoproject.com/en/2.2/ref/applications/#troubleshooting
import django
django.setup()
# DJANGO INTEGRATION DONE
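With both halves of the integration in place, a quick sanity check (assuming the models defined above and a migrated database) is to open a Scrapy shell inside the scraper project and import a Django model:

$ cd stock_screener/apps/scraper
$ scrapy shell
>>> from stock_screener.apps.northbound.models import NetFlow
>>> NetFlow.objects.count()

If the import succeeds and the count query runs without error, Scrapy can see the Django ORM.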
Writing the Scrapy Project
$ vim stock_screener/apps/scraper/scraper/items.py
from scrapy_djangoitem import DjangoItem

from stock_screener.apps.northbound.models import NetFlow, ShareHolding


class NetFlowItem(DjangoItem):
    django_model = NetFlow


class ShareHoldingItem(DjangoItem):
    django_model = ShareHolding
$ vim stock_screener/apps/scraper/scraper/spiders/northbound.py
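The spider body is not shown in the original, and the data source (the <domain> placeholder above) was left unspecified, so the following is only a rough sketch of the shape such a spider could take; the URL and the selectors are placeholders, not the real source:

import scrapy

from scraper.items import NetFlowItem


class NorthboundSpider(scrapy.Spider):
    name = 'northbound'
    # Placeholder URL: the real northbound data source was not given.
    start_urls = ['https://example.com/northbound/netflow']

    def parse(self, response):
        # Hypothetical parsing; adapt the selectors (or JSON handling)
        # to whatever the actual source returns.
        for row in response.css('table tbody tr'):
            item = NetFlowItem()
            item['code'] = row.css('td:nth-child(1)::text').get()
            item['name'] = row.css('td:nth-child(2)::text').get()
            item['inflow'] = float(row.css('td:nth-child(3)::text').get())
            yield item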
$ vim stock_screener/apps/scraper/scraper/pipelines.py
from django.db import transaction

from stock_screener.apps.northbound.models import NetFlow, ShareHolding


class NorthboundPipeline:
    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(item)
        return item

    def close_spider(self, spider):
        # Replace the previous snapshot with the newly scraped data
        # in a single transaction.
        try:
            with transaction.atomic():
                NetFlow.objects.all().delete()
                ShareHolding.objects.all().delete()
                for item in self.items:
                    item.save()
        except Exception as e:
            spider.logger.error(e)
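Remember to enable the pipeline in scraper/settings.py, using Scrapy's standard ITEM_PIPELINES setting:

ITEM_PIPELINES = {
    'scraper.pipelines.NorthboundPipeline': 300,
}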
Using Celery with Django to call Scrapy
$ sudo pacman -S docker
$ sudo systemctl start docker && sudo systemctl enable docker
$ sudo vim /etc/docker/daemon.json
{
    "storage-driver": "overlay2",
    "registry-mirrors": [
        "http://docker.mirrors.ustc.edu.cn",
        "http://registry.docker-cn.com",
        "http://hub-mirror.c.163.com",
        "http://ovfftd6p.mirror.aliyuncs.com"
    ]
}

Restart the daemon so the mirror configuration takes effect:

$ sudo systemctl restart docker
$ sudo docker run -d -p 6379:6379 redis
$ vim stock_screener/settings.py
INSTALLED_APPS = [
    ...
    'django_celery_results',
    'django_celery_beat',
    ...
]

CELERY_BROKER_URL = os.environ.get('REDIS_URL', 'redis://localhost')
CELERY_RESULT_BACKEND = 'django-db'
$ python manage.py migrate django_celery_results
$ python manage.py migrate django_celery_beat
Defining the Celery Instance
$ vim stock_screener/celery.py
from __future__ import absolute_import, unicode_literals

import os

from celery import Celery

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'stock_screener.settings')

app = Celery('stock_screener')
app.config_from_object('django.conf:settings', namespace='CELERY')

# Solve Django + Celery + Scrapy twisted reactor (ReactorNotRestartable) errors.
app.conf.update(
    worker_max_tasks_per_child=1,
    broker_pool_limit=None,
)

app.autodiscover_tasks()


@app.task(bind=True)
def debug_task(self):
    print('Request: {0!r}'.format(self.request))
$ vim stock_screener/__init__.py
from .celery import app as celery_app

__all__ = ('celery_app',)
Creating Tasks
$ vim stock_screener/apps/jobs/tasks.py
from celery import shared_task
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


@shared_task
def scrape_task(spider_name):
    """Celery task to scrape a website with Scrapy.

    http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    """
    configure_logging()
    s = get_project_settings()
    s.update({'LOG_FILE': 'scrapy.log'})
    process = CrawlerProcess(s)
    process.crawl(spider_name)
    process.start()
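You can try the task from a Django shell before wiring up the HTTP view (this assumes the Redis broker and the worker started below are running):

$ python manage.py shell
>>> from stock_screener.apps.jobs.tasks import scrape_task
>>> scrape_task.delay('northbound')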
$ vim stock_screener/apps/jobs/views.py
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from rest_framework.decorators import api_view

from stock_screener.apps.jobs.tasks import scrape_task

spider_names = ['northbound']


@csrf_exempt
@api_view(['POST'])
def scrape_job(request):
    """
    Retrieve a scrapy job status, or create a new scrapy job.
    """
    spider_name = request.data.get('spider_name')
    if spider_name in spider_names:
        res = scrape_task.delay(spider_name)
        return JsonResponse({'task_id': res.task_id, 'status': 'started'})
    else:
        return JsonResponse({'task_id': None, 'status': 'not found'})
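The original does not show how this view is routed. For the curl test below to reach it, something like the following minimal sketch is needed in stock_screener/apps/jobs/urls.py, plus an include in the root URLconf:

# stock_screener/apps/jobs/urls.py
from django.urls import path

from stock_screener.apps.jobs import views

urlpatterns = [
    path('scrape_job/', views.scrape_job),
]

# stock_screener/urls.py -- add to urlpatterns:
#     path('jobs/', include('stock_screener.apps.jobs.urls')),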
Starting the Worker Process
$ celery -A stock_screener worker -l info
$ celery -A stock_screener beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler
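To run the spider on a schedule, create a periodic task for django-celery-beat, either through the admin or programmatically. A sketch, assuming an hourly cadence (the schedule values and task name are illustrative):

import json

from django_celery_beat.models import CrontabSchedule, PeriodicTask

# Run at minute 0 of every hour.
schedule, _ = CrontabSchedule.objects.get_or_create(
    minute='0',
    hour='*',
    day_of_week='*',
    day_of_month='*',
    month_of_year='*',
)
PeriodicTask.objects.get_or_create(
    crontab=schedule,
    name='Scrape northbound hourly',
    task='stock_screener.apps.jobs.tasks.scrape_task',
    args=json.dumps(['northbound']),
)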
Testing
$ python manage.py runserver 0.0.0.0:8000
$ curl -X POST \
    http://localhost:8000/jobs/scrape_job/ \
    -H 'content-type: application/json' \
    -d '{"spider_name": "northbound"}'
Finally, inspect the scraped data and task results in the Django admin at http://localhost:8000/admin/.
Reference
- https://medium.com/@ali_oguzhan/how-to-use-scrapy-with-django-application-c16fabd0e62e
- https://blog.theodo.com/2019/01/data-scraping-scrapy-django-integration/
- https://github.com/holgerd77/django-dynamic-scraper
- https://github.com/snowunnotech/nicetomeetyou
- https://github.com/richardcornish/jobboardscraper
- https://github.com/cmwaura/Newspade